Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(425)

Side by Side Diff: tools/lexer_generator/transition_keys.py

Issue 169523003: Experimental parser: split and rename some files (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « tools/lexer_generator/transition_key.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 # Copyright 2013 the V8 project authors. All rights reserved.
2 # Redistribution and use in source and binary forms, with or without
3 # modification, are permitted provided that the following conditions are
4 # met:
5 #
6 # * Redistributions of source code must retain the above copyright
7 # notice, this list of conditions and the following disclaimer.
8 # * Redistributions in binary form must reproduce the above
9 # copyright notice, this list of conditions and the following
10 # disclaimer in the documentation and/or other materials provided
11 # with the distribution.
12 # * Neither the name of Google Inc. nor the names of its
13 # contributors may be used to endorse or promote products derived
14 # from this software without specific prior written permission.
15 #
16 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28 from itertools import chain
29 from encoding import KeyEncoding
30 from action import Term
31
32 class TransitionKey(object):
33 '''Represents a transition from a state in DFA or NFA to another state.
34
35 A transition key has a list of character ranges and a list of class ranges
36 (e.g., "whitespace"), defining for which characters the transition
37 happens. When we generate code based on the transition key, the character
38 ranges generate simple checks and the class ranges generate more complicated
39 conditions, e.g., function calls.'''
40
41 __cached_keys = {
42 'no_encoding' : {},
43 'latin1' : {},
44 'utf8' : {},
45 'utf16' : {},
46 }
47
48 @staticmethod
49 def __cached_key(encoding, name, components_getter):
50 encoding_name = encoding.name() if encoding else 'no_encoding'
51 cache = TransitionKey.__cached_keys[encoding_name]
52 if not name in cache:
53 cache[name] = TransitionKey(encoding, components_getter())
54 return cache[name]
55
56 @staticmethod
57 def epsilon():
58 '''Returns a TransitionKey for the epsilon (empty) transition.'''
59 return TransitionKey.__cached_key(None, 'epsilon',
60 lambda : Term("EPSILON_KEY"))
61
62 @staticmethod
63 def omega():
64 '''Always matches.'''
65 return TransitionKey.__cached_key(None, 'omega', lambda : Term("OMEGA_KEY"))
66
67 @staticmethod
68 def any(encoding):
69 '''Returns a TransitionKey which matches any encoded character.'''
70 return TransitionKey.__cached_key(encoding, 'any',
71 lambda : encoding.all_components_iter())
72
73 @staticmethod
74 def single_char(encoding, char):
75 '''Returns a TransitionKey for a single-character transition.'''
76 return TransitionKey.range(encoding, char, char)
77
78 @staticmethod
79 def range(encoding, a, b):
80 '''Returns a TransitionKey for a single-character transition.'''
81 return TransitionKey(encoding, encoding.numeric_range_term(a, b))
82
83 @staticmethod
84 def unique(term): # TODO(dcarney): rename
85 '''Returns a unique TransitionKey for the given name (and creates it if it
86 doesn't exist yet). The TransitionKey won't have any real character range,
87 but a newly-created "mock" character range which is separate from all other
88 character ranges.'''
89 return TransitionKey(None, Term("TERM_KEY", term))
90
91 @staticmethod
92 def __process_term(encoding, term, components, key_map):
93 key = term.name()
94 args = term.args()
95 if key == 'RANGE':
96 components.append(encoding.numeric_range_term(args[0], args[1]))
97 elif key == 'LITERAL':
98 components += map(lambda x : encoding.numeric_range_term(x, x), args)
99 elif key == 'CAT':
100 for x in args:
101 TransitionKey.__process_term(encoding, x, components, key_map)
102 elif key == 'CHARACTER_CLASS':
103 class_name = args[0]
104 if encoding.named_range(class_name):
105 c = encoding.named_range(class_name)
106 if class_name in key_map:
107 assert key_map[class_name] == TransitionKey(encoding, c)
108 components.append(c)
109 elif encoding.predefined_range_iter(class_name):
110 cs = list(encoding.predefined_range_iter(class_name))
111 if class_name in key_map:
112 assert key_map[class_name] == TransitionKey(encoding, cs)
113 components += cs
114 elif class_name in key_map:
115 components += key_map[class_name].__flatten()
116 else:
117 raise Exception('unknown character class [%s]' % args[0])
118 else:
119 raise Exception('bad key [%s]' % key)
120
121 @staticmethod
122 def character_class(encoding, term, key_map):
123 '''Processes 'term' (a representation of a character class in the rule
124 file), and constructs a TransitionKey based on it. 'key_map' contains
125 already constructed aliases for character classes (they can be used in the
126 new character class). It is a map from strings (character class names) to
127 TransitionKeys. For example, graph might represent the character class
128 [a-z:digit:] where 'digit' is a previously constructed character class found
129 in "key_map".'''
130 components = []
131 assert term.name() == 'CLASS' or term.name() == 'NOT_CLASS'
132 invert = term.name() == 'NOT_CLASS'
133 assert len(term.args()) == 1
134 TransitionKey.__process_term(encoding, term.args()[0], components, key_map)
135 key = TransitionKey.__key_from_components(encoding, invert, components)
136 assert key != None, "not invertible %s " % str(term)
137 return key
138
139 def matches_char(self, char):
140 'this is just for simple lexer testing and is incomplete'
141 char = ord(char)
142 assert 0 <= char and char < 128
143 for c in self.__flatten():
144 if c.name() == 'NUMERIC_RANGE_KEY':
145 r = c.args()
146 if r[0] <= char and char <= r[1]:
147 return True
148 return False
149
150 # (disjoint, subset, advance_left, advance_right)
151 __is_superset_of_key_helper = (
152 (True, True, False, False), # -5 : error
153 (True, True, False, False), # -4 : error
154 (False, True, False, True), # -3 :
155 (False, False, True, False), # -2 :
156 (False, False, True, False), # -1 :
157 (False, True, True, True), # 0 :
158 (True, False, False, True), # 1 :
159 (True, False, False, True), # 2 :
160 (True, True, False, False), # 3 : error
161 (False, True, False, True), # 4 :
162 (True, True, False, False), # 5 : error
163 )
164
165 def is_superset_of_key(self, key):
166 '''Returns true if 'key' is a sub-key of this TransitionKey.
167 must be called on a key that is either a subset or disjoint'''
168 helper = TransitionKey.__is_superset_of_key_helper
169 (left, right) = (self.__flatten(), key.__flatten())
170 (disjoint, subset, advance_left, advance_right) = (False, False, True, True)
171 (right_exhausted, left_exhausted) = (False, False)
172 while advance_left or advance_right:
173 if advance_right:
174 try:
175 r = right.next()
176 except StopIteration:
177 right_exhausted = True
178 if advance_left:
179 try:
180 l = left.next()
181 except StopIteration:
182 left_exhausted = True
183 if right_exhausted or left_exhausted:
184 break
185 c = TransitionKey.__component_compare(l, r)
186 (d, s, advance_left, advance_right) = helper[c + 5]
187 disjoint |= d
188 subset |= s
189 if not right_exhausted:
190 disjoint = True
191 if disjoint and subset:
192 raise Exception('not a subset and not disjoint')
193 return subset
194
195 @staticmethod
196 def compare(self, other):
197 left = list(self.__flatten())
198 right = list(other.__flatten())
199 offset = 0
200 while offset < len(left) and offset < len(right):
201 c = TransitionKey.__component_compare(left[offset], right[offset])
202 if c != 0:
203 return c
204 offset += 1
205 return TransitionKey.__cmp(len(left), len(right))
206
207 def __cmp__(self, other):
208 return TransitionKey.compare(self, other)
209
210 def __hash__(self):
211 return hash(self.__term)
212
213 def __ne__(self, other):
214 return not self == other
215
216 def __eq__(self, other):
217 return isinstance(other, TransitionKey) and self.__term == other.__term
218
219 def range_iter(self, encoding):
220 for c in self.__flatten():
221 if c.name() == 'NUMERIC_RANGE_KEY':
222 yield ('PRIMARY_RANGE', (c.args()[0], c.args()[1]))
223 elif c.name() == 'NAMED_RANGE_KEY':
224 yield ('CLASS', c.args()[0])
225 elif c.name() == 'TERM_KEY':
226 yield ('UNIQUE', c.args()[0])
227 elif c.name() == 'OMEGA_KEY':
228 yield ('OMEGA', ())
229 else:
230 assert False, 'unimplemented %s' % c
231
232 @staticmethod
233 def __component_str(encoding, component):
234 if component.name() == 'TERM_KEY':
235 return component.args()[0]
236 elif component.name() == 'NAMED_RANGE_KEY':
237 return component.args()[0]
238 elif component.name() == 'EPSILON_KEY':
239 return 'epsilon'
240 elif component.name() == 'OMEGA_KEY':
241 return 'omega'
242 elif component.name() == 'NUMERIC_RANGE_KEY':
243 r = component.args()
244 to_str = lambda x: KeyEncoding.to_str(encoding, x)
245 if r[0] == r[1]:
246 return "'%s'" % to_str(r[0])
247 return "['%s'-'%s']" % (to_str(r[0]), to_str(r[1]))
248 raise Exception('unprintable %s' % component)
249
250 def __flatten(self):
251 return self.__flatten_components([self.__term])
252
253 @staticmethod
254 def __flatten_components(components):
255 for component in components:
256 if component.name() != 'COMPOSITE_KEY':
257 yield component
258 else:
259 for arg in component.args():
260 yield arg
261
262 __component_name_order = {
263 'EPSILON_KEY' : 0,
264 'NUMERIC_RANGE_KEY' : 1,
265 'NAMED_RANGE_KEY' : 2,
266 'TERM_KEY' : 3,
267 'OMEGA_KEY' : 4
268 }
269
270 @staticmethod
271 def __cmp(a, b):
272 'wraps standard cmp function to return correct results for components'
273 c = cmp(a, b)
274 return 0 if c == 0 else (-1 if c < 0 else 1)
275
276 @staticmethod
277 def __component_name_compare(a, b):
278 return TransitionKey.__cmp(TransitionKey.__component_name_order[a],
279 TransitionKey.__component_name_order[b])
280
281 @staticmethod
282 def __component_compare(a, b):
283 '''component-wise compare function, returns the following values when
284 comparing non identical numerical ranges:
285 non-overlapping : -2 -- a0 <= a1 < b0 <= b1
286 b subset of a : -3 -- a0 < b0 <= b1 <= a1
287 a subset of b : -4 -- a0 == b0 and a1 < b1
288 a overlaps to left : -5 -- a0 < b0 and b0 <= a1 < b1
289 otherwise a value in [-1, 1] is returned'''
290 if a.name() != b.name():
291 return TransitionKey.__component_name_compare(a.name(), b.name())
292 if a.name() != 'NUMERIC_RANGE_KEY':
293 return TransitionKey.__cmp(str(a), str(b))
294 (a, b) = (a.args(), b.args())
295 c0 = TransitionKey.__cmp(a[0], b[0])
296 if c0 == 0:
297 return 4 * TransitionKey.__cmp(a[1], b[1]) # either == or a subset
298 sign = -1
299 if c0 > 0:
300 (a, b, sign) = (b, a, 1) # normalize ordering so that a0 < b0
301 assert a[0] < b[0]
302 if b[1] <= a[1]: # subset
303 return 3 * sign
304 if a[1] < b[0]: # non overlap
305 return 2 * sign
306 return 5 * sign # partial overlap
307
308 @staticmethod
309 def __is_composable(term):
310 return term.name() != 'EPSILON_KEY' and term.name() != 'OMEGA_KEY'
311
312 @staticmethod
313 def __construct(encoding, components):
314 if isinstance(components, Term):
315 components = [components]
316 is_unique = False
317 acc = []
318 last = Term.empty_term()
319 for current in TransitionKey.__flatten_components(components):
320 name = current.name()
321 # verify arguments
322 if name == 'EPSILON_KEY' or name == 'OMEGA_KEY' or name == 'TERM_KEY':
323 pass
324 elif name == 'NUMERIC_RANGE_KEY':
325 assert encoding.is_primary_range(current.args())
326 if last.name() == 'NUMERIC_RANGE_KEY':
327 assert last.args()[1] + 1 < current.args()[0], 'ranges must be merged'
328 elif name == 'NAMED_RANGE_KEY':
329 assert encoding.named_range(current.args()[0])
330 else:
331 raise Exception('illegal component %s' % str(current))
332 # verify ordering, composability
333 if last:
334 assert TransitionKey.__is_composable(current), 'cannot compose'
335 if len(acc) == 1:
336 assert TransitionKey.__is_composable(last), 'cannot compose'
337 c = TransitionKey.__component_compare(last, current)
338 assert c == -1 or c == -2, 'bad order %s %s' % (str(last), str(current))
339 acc.append(current)
340 last = current
341 assert acc, "must have components"
342 return acc[0] if len(acc) == 1 else Term('COMPOSITE_KEY', *acc)
343
344 def __init__(self, encoding, components):
345 self.__term = TransitionKey.__construct(encoding, components)
346 self.__cached_hash = None
347
348 def to_string(self, encoding):
349 return ', '.join(map(lambda x : TransitionKey.__component_str(encoding, x),
350 self.__flatten()))
351
352 def __str__(self):
353 return self.to_string(None)
354
355 @staticmethod
356 def __disjoint_keys(encoding, range_map):
357 '''Takes a set of possibly overlapping ranges, returns a list of ranges
358 which don't overlap and which cover the same points as the original
359 set. range_map is a map from lower bounds to a list of upper bounds.'''
360 sort = lambda x : sorted(set(x))
361 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))
362 upper_bound = encoding.upper_bound() + 1
363 for i in range(len(range_map)):
364 (left, left_values) = range_map[i]
365 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound
366 to_push = []
367 for v in left_values:
368 assert left <= next
369 if v >= next:
370 if not to_push and left < next:
371 yield (left, next - 1)
372 to_push.append(v)
373 else:
374 yield (left, v)
375 left = v + 1
376 if to_push:
377 current = range_map[i + 1]
378 range_map[i + 1] = (current[0], sort(current[1] + to_push))
379
380 @staticmethod
381 def __merge_ranges(encoding, ranges):
382 last = None
383 for r in ranges:
384 if last == None:
385 last = r
386 elif last[1] + 1 == r[0]:
387 last = (last[0], r[1])
388 else:
389 yield last
390 last = r
391 if last != None:
392 yield last
393
394 @staticmethod
395 def __flatten_keys(keys):
396 return chain(*map(lambda x : x.__flatten(), keys))
397
398 @staticmethod
399 def __disjoint_components_from_keys(encoding, keys, merge_ranges = False):
400 return TransitionKey.__disjoint_components(
401 encoding, TransitionKey.__flatten_keys(keys), merge_ranges)
402
403 @staticmethod
404 def __disjoint_components(encoding, components, merge_ranges):
405 range_map = {}
406 other_keys = set([])
407 for x in components:
408 if x.name() != 'NUMERIC_RANGE_KEY':
409 other_keys.add(x)
410 continue
411 (start, end) = x.args()
412 if not start in range_map:
413 range_map[start] = []
414 range_map[start].append(end)
415 ranges = TransitionKey.__disjoint_keys(encoding, range_map)
416 if merge_ranges:
417 ranges = TransitionKey.__merge_ranges(encoding, ranges)
418 other_keys = sorted(other_keys, cmp=TransitionKey.__component_compare)
419 range_terms = map(
420 lambda x : encoding.numeric_range_term(x[0], x[1]), ranges)
421 return chain(iter(range_terms), iter(other_keys))
422
423 @staticmethod
424 def __key_from_components(encoding, invert, components):
425 components = TransitionKey.__disjoint_components(encoding, components, True)
426 if invert:
427 components = TransitionKey.__invert_components(encoding, components)
428 return None if not components else TransitionKey(encoding, components)
429
430 @staticmethod
431 def disjoint_keys(encoding, keys):
432 '''Takes a set of possibly overlapping TransitionKeys, returns a list of
433 TransitionKeys which don't overlap and whose union is the same as the union
434 of the original key_set. In addition, TransitionKeys are not merged, only
435 split.
436
437 For example, if key_set contains two TransitionKeys for ranges [1-10] and
438 [5-15], disjoint_keys returns a set of three TransitionKeys: [1-4], [5-10],
439 [11-16].'''
440 return map(lambda x : TransitionKey(encoding, x),
441 TransitionKey.__disjoint_components_from_keys(encoding, keys))
442
443 @staticmethod
444 def inverse_key(encoding, keys):
445 '''Returns a TransitionKey which matches represents the inverse of the union
446 of 'keys'. The TransitionKeys contain a set of character ranges and a set
447 of classes. The character ranges are inverted in relation to the latin_1
448 character range, and the character classes are inverted in relation to all
449 character classes in __class_bounds.'''
450 return TransitionKey.__key_from_components(
451 encoding, True, TransitionKey.__flatten_keys(keys))
452
453 @staticmethod
454 def merged_key(encoding, keys):
455 return TransitionKey(encoding,
456 TransitionKey.__disjoint_components_from_keys(encoding, keys, True))
457
458 @staticmethod
459 def __invert_components(encoding, components):
460 key = lambda x, y: encoding.numeric_range_term(x, y)
461 last = None
462 classes = set(encoding.named_range_value_iter())
463 for c in components:
464 if c in classes:
465 classes.remove(c)
466 continue
467 assert c.name() == 'NUMERIC_RANGE_KEY', 'unencoded key not invertible'
468 r = c.args()
469 assert encoding.is_primary_range(r)
470 if last == None:
471 if r[0] != encoding.lower_bound():
472 yield key(encoding.lower_bound(), r[0] - 1)
473 elif last[1] + 1 < r[0]:
474 yield key(last[1] + 1, r[0] - 1)
475 last = r
476 upper_bound = encoding.primary_range()[1]
477 if last == None:
478 r = encoding.primary_range()
479 yield key(r[0], r[1])
480 elif last[1] < upper_bound:
481 yield key(last[1] + 1, upper_bound)
482 for c in sorted(classes, TransitionKey.__component_compare):
483 yield c
OLDNEW
« no previous file with comments | « tools/lexer_generator/transition_key.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698