tools/lexer_generator/transition_keys.py - Issue 63943005: Experimental lexer generator: refactor(TransitionKey) += comments.

Side by Side Diff: tools/lexer_generator/transition_keys.py

Issue 63943005: Experimental lexer generator: refactor(TransitionKey) += comments. (Closed) Base URL: https://v8.googlecode.com/svn/branches/experimental/parser

Patch Set: moar comments Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 # Copyright 2013 the V8 project authors. All rights reserved.	1 # Copyright 2013 the V8 project authors. All rights reserved.

2 # Redistribution and use in source and binary forms, with or without	2 # Redistribution and use in source and binary forms, with or without

3 # modification, are permitted provided that the following conditions are	3 # modification, are permitted provided that the following conditions are

4 # met:	4 # met:

5 #	5 #

6 # * Redistributions of source code must retain the above copyright	6 # * Redistributions of source code must retain the above copyright

7 # notice, this list of conditions and the following disclaimer.	7 # notice, this list of conditions and the following disclaimer.

8 # * Redistributions in binary form must reproduce the above	8 # * Redistributions in binary form must reproduce the above

9 # copyright notice, this list of conditions and the following	9 # copyright notice, this list of conditions and the following

10 # disclaimer in the documentation and/or other materials provided	10 # disclaimer in the documentation and/or other materials provided

(...skipping 10 matching lines...) Expand all Loading...
21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT	21 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,	22 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY	23 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	24 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	25 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	26 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

27	27

28 from string import printable	28 from string import printable

29	29

30 class TransitionKey:	30 class TransitionKey:

	31 '''Represents a transition from a state in DFA or NFA to another state.

	32

	33 A transition key has a list of character ranges and a list of class ranges

	34 (e.g., "whitespace"), defining for which characters the transition

	35 happens. When we generate code based on the transition key, the character

	36 ranges generate simple checks and the class ranges generate more complicated

	37 conditions, e.g., function calls.'''

31	38

32 __class_bounds = {	39 __class_bounds = {

33 'latin_1' : (1, 255),	40 'latin_1' : (1, 255),

34 # These are not real ranges; they just need to be separate from any real	41 # These are not real ranges; they just need to be separate from any real

35 # ranges.	42 # ranges.

36 'whitespace' : (256, 256),	43 'whitespace' : (256, 256),

37 'literal' : (257, 257),	44 'literal' : (257, 257),

38 'eos' : (258, 258),	45 'eos' : (258, 258),

39 'zero' : (259, 259),	46 'zero' : (259, 259),

40 }	47 }

41 __lower_bound = 1	48 __lower_bound = 1

42 __upper_bound = reduce(lambda acc, (k, v): max(acc, v[1]), __class_bounds.item s(), 0)	49 __upper_bound = max(__class_bounds.values(), key=lambda item: item[1])[1]

43	50

44 __cached_keys = {}	51 __cached_keys = {}

45	52

46 __unique_key_counter = -1	53 __unique_key_counter = -1

47	54

48 @staticmethod	55 @staticmethod

49 def __in_latin_1(char):	56 def __in_latin_1(char):

50 bound = TransitionKey.__class_bounds['latin_1']	57 bound = TransitionKey.__class_bounds['latin_1']

51 return (bound[0] <= char and char <= bound[1])	58 return (bound[0] <= char and char <= bound[1])

52	59

(...skipping 18 matching lines...) Expand all Loading...
71 r_is_class = TransitionKey.__is_class_range(r)	78 r_is_class = TransitionKey.__is_class_range(r)

72 # Assert that the ranges are in order.	79 # Assert that the ranges are in order.

73 if last != None and check_merged:	80 if last != None and check_merged:

74 assert last[1] + 1 < r[0] or r_is_class	81 assert last[1] + 1 < r[0] or r_is_class

75 if not TransitionKey.__in_latin_1(r[0]):	82 if not TransitionKey.__in_latin_1(r[0]):

76 assert r_is_class	83 assert r_is_class

77 if not TransitionKey.__in_latin_1(r[1]):	84 if not TransitionKey.__in_latin_1(r[1]):

78 assert r_is_class	85 assert r_is_class

79 last = r	86 last = r

80	87

81 @staticmethod

82 def __create(ranges, name = None):

83 if not ranges:

84 assert name == 'epsilon'

85 assert not name in TransitionKey.__cached_keys

86 else:

87 TransitionKey.__verify_ranges(ranges, True)

88 key = TransitionKey()

89 key.__ranges = tuple(ranges) # immutable

90 key.__name = name

91 key.__cached_hash = None

92 return key

93

94 def __is_unique(self):	88 def __is_unique(self):

95 return len(self.__ranges) == 1 and self.__is_unique_range(self.__ranges[0])	89 return len(self.__ranges) == 1 and self.__is_unique_range(self.__ranges[0])

96	90

97 @staticmethod	91 @staticmethod

98 def __cached_key(name, bounds_getter):	92 def __cached_key(name, bounds_getter):

99 if not name in TransitionKey.__cached_keys:	93 if not name in TransitionKey.__cached_keys:

100 bounds = bounds_getter(name)	94 bounds = bounds_getter(name)

101 TransitionKey.__cached_keys[name] = TransitionKey.__create(bounds, name)	95 TransitionKey.__cached_keys[name] = TransitionKey(bounds, name)

102 return TransitionKey.__cached_keys[name]	96 return TransitionKey.__cached_keys[name]

103	97

104 @staticmethod	98 @staticmethod

105 def epsilon():	99 def epsilon():

	100 '''Returns a TransitionKey for the epsilon (empty) transition.'''

106 return TransitionKey.__cached_key('epsilon', lambda name : [])	101 return TransitionKey.__cached_key('epsilon', lambda name : [])

107	102

108 @staticmethod	103 @staticmethod

109 def any():	104 def any():

	105 '''Returns a TransitionKey which matches any character.'''

110 def bounds_getter(name):	106 def bounds_getter(name):

111 bounds = TransitionKey.__class_bounds.values()	107 bounds = TransitionKey.__class_bounds.values()

112 bounds.sort()	108 bounds.sort()

113 return bounds	109 return bounds

114 return TransitionKey.__cached_key('any', bounds_getter)	110 return TransitionKey.__cached_key('any', bounds_getter)

115	111

116 @staticmethod	112 @staticmethod

117 def single_char(char):	113 def single_char(char):

	114 '''Returns a TransitionKey for a single-character transition.'''

118 char = ord(char)	115 char = ord(char)

119 assert TransitionKey.__in_latin_1(char)	116 assert TransitionKey.__in_latin_1(char)

120 return TransitionKey.__create([(char, char)])	117 return TransitionKey([(char, char)])

121	118

122 @staticmethod	119 @staticmethod

123 def unique(name):	120 def unique(name):

	121 '''Returns a unique TransitionKey for the given name (and creates it if it

	122 doesn't exist yet). The TransitionKey won't have any real character range,

	123 but a newly-created "mock" character range which is separate from all other

	124 character ranges.'''

124 def get_bounds(name):	125 def get_bounds(name):

125 bound = TransitionKey.__unique_key_counter	126 bound = TransitionKey.__unique_key_counter

126 TransitionKey.__unique_key_counter -= 1	127 TransitionKey.__unique_key_counter -= 1

127 return [(bound, bound)]	128 return [(bound, bound)]

128 name = '__' + name	129 name = '__' + name

129 return TransitionKey.__cached_key(name, get_bounds)	130 return TransitionKey.__cached_key(name, get_bounds)

130	131

131 @staticmethod	132 @staticmethod

132 def __process_graph(graph, ranges, key_map):	133 def __process_graph(graph, ranges, key_map):

133 key = graph[0]	134 key = graph[0]

(...skipping 16 matching lines...) Expand all Loading...
150 ranges.append(TransitionKey.__class_bounds['zero'])	151 ranges.append(TransitionKey.__class_bounds['zero'])

151 elif class_name in key_map:	152 elif class_name in key_map:

152 ranges += key_map[class_name].__ranges	153 ranges += key_map[class_name].__ranges

153 else:	154 else:

154 raise Exception('unknown character class [%s]' % graph[1])	155 raise Exception('unknown character class [%s]' % graph[1])

155 else:	156 else:

156 raise Exception('bad key [%s]' % key)	157 raise Exception('bad key [%s]' % key)

157	158

158 @staticmethod	159 @staticmethod

159 def character_class(graph, key_map):	160 def character_class(graph, key_map):

	161 '''Processes 'graph' (a representation of a character class in the rule

	162 file), and constructs a TransitionKey based on it. 'key_map' contains

	163 already constructed aliases for character classes (they can be used in the

	164 new character class). It is a map from strings (character class names) to

	165 TransitionKeys. For example, graph might represent the character class

	166 [a-z:digit:] where 'digit' is a previously constructed characte class found

	167 in "key_map".'''

160 ranges = []	168 ranges = []

161 assert graph[0] == 'CLASS' or graph[0] == 'NOT_CLASS'	169 assert graph[0] == 'CLASS' or graph[0] == 'NOT_CLASS'

162 invert = graph[0] == 'NOT_CLASS'	170 invert = graph[0] == 'NOT_CLASS'

163 TransitionKey.__process_graph(graph[1], ranges, key_map)	171 TransitionKey.__process_graph(graph[1], ranges, key_map)

164 return TransitionKey.__key_from_ranges(invert, ranges)	172 return TransitionKey.__key_from_ranges(invert, ranges)

165	173

166 def matches_char(self, char):	174 def matches_char(self, char):

167 char = ord(char)	175 char = ord(char)

168 # TODO class checks	176 # TODO class checks

169 for r in self.__ranges:	177 for r in self.__ranges:

170 if r[0] <= char and char <= r[1]: return True	178 if r[0] <= char and char <= r[1]: return True

171 return False	179 return False

172	180

173 def matches_key(self, key):	181 def is_superset_of_key(self, key):

	182 '''Returns true if 'key' is a sub-key of this TransitionKey.'''

174 assert isinstance(key, self.__class__)	183 assert isinstance(key, self.__class__)

175 assert key != TransitionKey.epsilon() and not key.__is_unique()	184 assert key != TransitionKey.epsilon() and not key.__is_unique()

176 assert len(key.__ranges) == 1	185 assert len(key.__ranges) == 1

177 subkey = key.__ranges[0]	186 subkey = key.__ranges[0]

178 matches = False	187 matches = False

179 for k in self.__ranges:	188 for k in self.__ranges:

180 if k[0] <= subkey[0]:	189 if k[0] <= subkey[0]:

181 assert subkey[1] <= k[1] or subkey[0] > k[1]	190 assert subkey[1] <= k[1] or subkey[0] > k[1]

182 if subkey[0] < k[0]:	191 if subkey[0] < k[0]:

183 assert subkey[1] < k[0]	192 assert subkey[1] < k[0]

(...skipping 56 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
240 assert TransitionKey.__in_latin_1(x)	249 assert TransitionKey.__in_latin_1(x)

241 if not x in TransitionKey.__printable_cache:	250 if not x in TransitionKey.__printable_cache:

242 res = "'%s'" % chr(x) if chr(x) in printable else str(x)	251 res = "'%s'" % chr(x) if chr(x) in printable else str(x)

243 TransitionKey.__printable_cache[x] = res	252 TransitionKey.__printable_cache[x] = res

244 return TransitionKey.__printable_cache[x]	253 return TransitionKey.__printable_cache[x]

245 if r[0] == r[1]:	254 if r[0] == r[1]:

246 return '%s' % to_str(r[0])	255 return '%s' % to_str(r[0])

247 else:	256 else:

248 return '[%s-%s]' % (to_str(r[0]), to_str(r[1]))	257 return '[%s-%s]' % (to_str(r[0]), to_str(r[1]))

249	258

	259 def __init__(self, ranges, name = None):

	260 if not ranges:

	261 assert name == 'epsilon'

	262 assert not name in TransitionKey.__cached_keys

	263 else:

	264 TransitionKey.__verify_ranges(ranges, True)

	265 self.__name = name

	266 self.__ranges = tuple(ranges) # immutable

	267 self.__cached_hash = None

	268

250 def __str__(self):	269 def __str__(self):

251 if self.__name:	270 if self.__name:

252 return self.__name	271 return self.__name

253 return ', '.join(TransitionKey.__range_str(x) for x in self.__ranges)	272 return ', '.join(TransitionKey.__range_str(x) for x in self.__ranges)

254	273

255 @staticmethod	274 @staticmethod

256 def __disjoint_keys(range_map):	275 def __disjoint_keys(range_map):

	276 '''Takes a set of possibly overlapping ranges, returns a list of ranges

	277 which don't overlap and which cover the same points as the original

	278 set. range_map is a map from lower bounds to a list of upper bounds.'''

257 sort = lambda x : sorted(set(x))	279 sort = lambda x : sorted(set(x))

258 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))	280 range_map = sorted(map(lambda (k, v): (k, sort(v)), range_map.items()))

259 ranges = []	281 ranges = []

260 upper_bound = TransitionKey.__upper_bound + 1	282 upper_bound = TransitionKey.__upper_bound + 1

261 for i in range(len(range_map)):	283 for i in range(len(range_map)):

262 (left, left_values) = range_map[i]	284 (left, left_values) = range_map[i]

263 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound	285 next = range_map[i + 1][0] if i != len(range_map) - 1 else upper_bound

264 to_push = []	286 to_push = []

265 for v in left_values:	287 for v in left_values:

266 assert left <= next	288 assert left <= next

(...skipping 20 matching lines...) Expand all Loading...
287 for r in x.__ranges:	309 for r in x.__ranges:

288 if not r[0] in range_map:	310 if not r[0] in range_map:

289 range_map[r[0]] = []	311 range_map[r[0]] = []

290 range_map[r[0]].append(r[1])	312 range_map[r[0]].append(r[1])

291 ranges = TransitionKey.__disjoint_keys(range_map)	313 ranges = TransitionKey.__disjoint_keys(range_map)

292 TransitionKey.__verify_ranges(ranges, False)	314 TransitionKey.__verify_ranges(ranges, False)

293 return ranges	315 return ranges

294	316

295 @staticmethod	317 @staticmethod

296 def disjoint_keys(key_set):	318 def disjoint_keys(key_set):

	319 '''Takes a set of possibly overlapping TransitionKeys, returns a list of

	320 TransitionKeys which don't overlap and whose union is the same as the union

	321 of the original key_set. key_set is a set of TransitionKeys.'''

297 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)	322 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)

298 return map(lambda x : TransitionKey.__create([x]), ranges)	323 return map(lambda x : TransitionKey([x]), ranges)

299	324

300 @staticmethod	325 @staticmethod

301 def inverse_key(key_set):	326 def inverse_key(key_set):

	327 '''Returns a TransitionKey which matches represents the inverse of the union

	328 of 'key_set'. The TransitionKeys contain a set of character ranges and a set

	329 of classes. The character ranges are inverted in relation to the latin_1

	330 character range, and the character classes are inverted in relation to all

	331 character classes in __class_bounds.'''

302 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)	332 ranges = TransitionKey.__disjoint_ranges_from_key_set(key_set)

303 inverse = TransitionKey.__invert_ranges(ranges)	333 inverse = TransitionKey.__invert_ranges(ranges)

304 if not inverse:	334 if not inverse:

305 return None	335 return None

306 return TransitionKey.__create(inverse)	336 return TransitionKey(inverse)

307	337

308 @staticmethod	338 @staticmethod

309 def __key_from_ranges(invert, ranges):	339 def __key_from_ranges(invert, ranges):

310 range_map = {}	340 range_map = {}

311 for r in ranges:	341 for r in ranges:

312 if not r[0] in range_map:	342 if not r[0] in range_map:

313 range_map[r[0]] = []	343 range_map[r[0]] = []

314 range_map[r[0]].append(r[1])	344 range_map[r[0]].append(r[1])

315 ranges = TransitionKey.__disjoint_keys(range_map)	345 ranges = TransitionKey.__disjoint_keys(range_map)

316 ranges = TransitionKey.__merge_ranges(ranges)	346 ranges = TransitionKey.__merge_ranges(ranges)

317 if invert:	347 if invert:

318 ranges = TransitionKey.__invert_ranges(ranges)	348 ranges = TransitionKey.__invert_ranges(ranges)

319 return TransitionKey.__create(ranges)	349 return TransitionKey(ranges)

320	350

321 @staticmethod	351 @staticmethod

322 def __merge_ranges(ranges):	352 def __merge_ranges(ranges):

323 merged = []	353 merged = []

324 last = None	354 last = None

325 for r in ranges:	355 for r in ranges:

326 assert not TransitionKey.__is_unique_range(r)	356 assert not TransitionKey.__is_unique_range(r)

327 if last == None:	357 if last == None:

328 last = r	358 last = r

329 elif (last[1] + 1 == r[0] and not TransitionKey.__is_class_range(r)):	359 elif (last[1] + 1 == r[0] and not TransitionKey.__is_class_range(r)):

330 last = (last[0], r[1])	360 last = (last[0], r[1])

331 else:	361 else:

332 merged.append(last)	362 merged.append(last)

333 last = r	363 last = r

334 if last != None:	364 if last != None:

335 merged.append(last)	365 merged.append(last)

336 return merged	366 return merged

337	367

338 @staticmethod	368 @staticmethod

339 def merged_key(keys):	369 def merged_key(keys):

340 f = lambda acc, key: acc + list(key.__ranges)	370 f = lambda acc, key: acc + list(key.__ranges)

341 return TransitionKey.__key_from_ranges(False, reduce(f, keys, []))	371 return TransitionKey.__key_from_ranges(False, reduce(f, keys, []))

342	372

343 @staticmethod	373 @staticmethod

344 def __invert_ranges(ranges):	374 def __invert_ranges(ranges):

345 inverted = []	375 inverted = []

346 last = None	376 last = None

	377 # Extract character classes (as opposed to character ranges) from

	378 # __class_bounds. Since latin_1 is the only real character range, we can do

	379 # this.

347 classes = set(TransitionKey.__class_bounds.values())	380 classes = set(TransitionKey.__class_bounds.values())

348 latin_1 = TransitionKey.__class_bounds['latin_1']	381 latin_1 = TransitionKey.__class_bounds['latin_1']

349 classes.remove(latin_1)	382 classes.remove(latin_1)

350 for r in ranges:	383 for r in ranges:

351 assert not TransitionKey.__is_unique_range(r)	384 assert not TransitionKey.__is_unique_range(r)

352 if TransitionKey.__is_class_range(r):	385 if TransitionKey.__is_class_range(r):

353 classes.remove(r)	386 classes.remove(r)

354 continue	387 continue

355 if last == None:	388 if last == None:

356 if r[0] != TransitionKey.__lower_bound:	389 if r[0] != TransitionKey.__lower_bound:

357 inverted.append((TransitionKey.__lower_bound, r[0] - 1))	390 inverted.append((TransitionKey.__lower_bound, r[0] - 1))

358 elif last[1] + 1 < r[0]:	391 elif last[1] + 1 < r[0]:

359 inverted.append((last[1] + 1, r[0] - 1))	392 inverted.append((last[1] + 1, r[0] - 1))

360 last = r	393 last = r

361 upper_bound = latin_1[1]	394 upper_bound = latin_1[1]

362 if last == None:	395 if last == None:

363 inverted.append(latin_1)	396 inverted.append(latin_1)

364 elif last[1] < upper_bound:	397 elif last[1] < upper_bound:

365 inverted.append((last[1] + 1, upper_bound))	398 inverted.append((last[1] + 1, upper_bound))

366 inverted += list(classes)	399 inverted += list(classes)

367 return inverted	400 return inverted

OLD	NEW

« no previous file with comments | « tools/lexer_generator/nfa.py ('k') | no next file » | no next file with comments »