OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright 2014 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 """ | |
7 A Deterministic acyclic finite state automaton (DAFSA) is a compact | |
8 representation of an unordered word list (dictionary). | |
9 | |
10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton | |
11 | |
12 This python program converts a list of strings to a byte array in C++. | |
13 This python program fetches strings and return values from a gperf file | |
14 and generates a C++ file with a byte array representing graph that can be | |
15 used as a memory efficient replacement for the perfect hash table. | |
16 | |
17 The input strings are assumed to consist of printable 7-bit ASCII characters | |
18 and the return values are assumed to be one digit integers. | |
19 | |
20 In this program a DAFSA is a diamond shaped graph starting at a common | |
21 source node and ending at a common sink node. All internal nodes contain | |
22 a label and each word is represented by the labels in one path from | |
23 the source node to the sink node. | |
24 | |
25 The following python represention is used for nodes: | |
26 | |
27 Source node: [ children ] | |
28 Internal node: (label, [ children ]) | |
29 Sink node: None | |
30 | |
31 The graph is first compressed by prefixes like a trie. In the next step | |
32 suffixes are compressed so that the graph gets diamond shaped. Finally | |
33 one to one linked nodes are replaced by nodes with the labels joined. | |
34 | |
35 The order of the operations is crucial since lookups will be performed | |
36 starting from the source with no backtracking. Thus a node must have at | |
37 most one child with a label starting by the same character. The output | |
38 is also arranged so that all jumps are to increasing addresses, thus forward | |
39 in memory. | |
40 | |
41 The generated output has suffix free decoding so that the sign of leading | |
42 bits in a link (a reference to a child node) indicate if it has a size of one, | |
43 two or three bytes and if it is the last outgoing link from the actual node. | |
44 A node label is terminated by a byte with the leading bit set. | |
45 | |
46 The generated byte array can described by the following BNF: | |
47 | |
48 <byte> ::= < 8-bit value in range [0x00-0xFF] > | |
49 | |
50 <char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > | |
51 <end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > | |
52 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] > | |
53 | |
54 <offset1> ::= < byte in range [0x00-0x3F] > | |
55 <offset2> ::= < byte in range [0x40-0x5F] > | |
56 <offset3> ::= < byte in range [0x60-0x7F] > | |
57 | |
58 <end_offset1> ::= < byte in range [0x80-0xBF] > | |
59 <end_offset2> ::= < byte in range [0xC0-0xDF] > | |
60 <end_offset3> ::= < byte in range [0xE0-0xFF] > | |
61 | |
62 <prefix> ::= <char> | |
63 | |
64 <label> ::= <end_char> | |
65 | <char> <label> | |
66 | |
67 <end_label> ::= <return_value> | |
68 | <char> <end_label> | |
69 | |
70 <offset> ::= <offset1> | |
71 | <offset2> <byte> | |
72 | <offset3> <byte> <byte> | |
73 | |
74 <end_offset> ::= <end_offset1> | |
75 | <end_offset2> <byte> | |
76 | <end_offset3> <byte> <byte> | |
77 | |
78 <offsets> ::= <end_offset> | |
79 | <offset> <offsets> | |
80 | |
81 <source> ::= <offsets> | |
82 | |
83 <node> ::= <label> <offsets> | |
84 | <prefix> <node> | |
85 | <end_label> | |
86 | |
87 <dafsa> ::= <source> | |
88 | <dafsa> <node> | |
89 | |
90 Decoding: | |
91 | |
92 <char> -> printable 7-bit ASCII character | |
93 <end_char> & 0x7F -> printable 7-bit ASCII character | |
94 <return value> & 0x0F -> integer | |
95 <offset1 & 0x3F> -> integer | |
96 ((<offset2> & 0x1F>) << 8) + <byte> -> integer | |
97 ((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer | |
98 | |
99 end_offset1, end_offset2 and and_offset3 are decoded same as offset1, | |
100 offset2 and offset3 respectively. | |
101 | |
102 The first offset in a list of offsets is the distance in bytes between the | |
103 offset itself and the first child node. Subsequent offsets are the distance | |
104 between previous child node and next child node. Thus each offset links a node | |
105 to a child node. The distance is always counted between start addresses, i.e. | |
106 first byte in decoded offset or first byte in child node. | |
107 | |
108 Example 1: | |
109 | |
110 %% | |
111 aa, 1 | |
112 a, 2 | |
113 %% | |
114 | |
115 The input is first parsed to a list of words: | |
116 ["aa1", "a2"] | |
117 | |
118 A fully expanded graph is created from the words: | |
119 source = [node1, node4] | |
120 node1 = ("a", [node2]) | |
121 node2 = ("a", [node3]) | |
122 node3 = ("\x01", [sink]) | |
123 node4 = ("a", [node5]) | |
124 node5 = ("\x02", [sink]) | |
125 sink = None | |
126 | |
127 Compression results in the following graph: | |
128 source = [node1] | |
129 node1 = ("a", [node2, node3]) | |
130 node2 = ("\x02", [sink]) | |
131 node3 = ("a\x01", [sink]) | |
132 sink = None | |
133 | |
134 A C++ representation of the compressed graph is generated: | |
135 | |
136 const unsigned char dafsa[7] = { | |
137 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81, | |
138 }; | |
139 | |
140 The bytes in the generated array has the following meaning: | |
141 | |
142 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1 | |
143 | |
144 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a" | |
145 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4 | |
146 | |
147 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5 | |
148 4: 0x82 <return_value> 0x82 & 0x0F -> return 2 | |
149 | |
150 5: 0x61 <char> label character 0x61 -> match "a" | |
151 6: 0x81 <return_value> 0x81 & 0x0F -> return 1 | |
152 | |
153 Example 2: | |
154 | |
155 %% | |
156 aa, 1 | |
157 bbb, 2 | |
158 baa, 1 | |
159 %% | |
160 | |
161 The input is first parsed to a list of words: | |
162 ["aa1", "bbb2", "baa1"] | |
163 | |
164 Compression results in the following graph: | |
165 source = [node1, node2] | |
166 node1 = ("b", [node2, node3]) | |
167 node2 = ("aa\x01", [sink]) | |
168 node3 = ("bb\x02", [sink]) | |
169 sink = None | |
170 | |
171 A C++ representation of the compressed graph is generated: | |
172 | |
173 const unsigned char dafsa[11] = { | |
174 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82, | |
175 }; | |
176 | |
177 The bytes in the generated array has the following meaning: | |
178 | |
179 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2 | |
180 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5 | |
181 | |
182 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b" | |
183 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5 | |
184 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8 | |
185 | |
186 5: 0x61 <char> label character 0x61 -> match "a" | |
187 6: 0x61 <char> label character 0x61 -> match "a" | |
188 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 | |
189 | |
190 8: 0x62 <char> label character 0x62 -> match "b" | |
191 9: 0x62 <char> label character 0x62 -> match "b" | |
192 10: 0x82 <return_value> 0x82 & 0x0F -> return 2 | |
193 """ | |
194 | |
195 import sys | |
196 | |
197 class InputError(Exception): | |
198 """Exception raised for errors in the input file.""" | |
199 | |
200 | |
201 def to_dafsa(words): | |
202 """Generates a DAFSA from a word list and returns the source node. | |
203 | |
204 Each word is split into characters so that each character is represented by | |
205 a unique node. It is assumed the word list is not empty. | |
206 """ | |
207 if not words: | |
208 raise InputError('The domain list must not be empty') | |
209 def ToNodes(word): | |
210 """Split words into characters""" | |
211 if not 0x1F < ord(word[0]) < 0x80: | |
212 raise InputError('Domain names must be printable 7-bit ASCII') | |
213 if len(word) == 1: | |
214 return chr(ord(word[0]) & 0x0F), [None] | |
215 return word[0], [ToNodes(word[1:])] | |
216 return [ToNodes(word) for word in words] | |
217 | |
218 | |
219 def to_words(node): | |
220 """Generates a word list from all paths starting from an internal node.""" | |
221 if not node: | |
222 return [''] | |
223 return [(node[0] + word) for child in node[1] for word in to_words(child)] | |
224 | |
225 | |
226 def reverse(dafsa): | |
227 """Generates a new DAFSA that is reversed, so that the old sink node becomes | |
228 the new source node. | |
229 """ | |
230 sink = [] | |
231 nodemap = {} | |
232 | |
233 def dfs(node, parent): | |
234 """Creates reverse nodes. | |
235 | |
236 A new reverse node will be created for each old node. The new node will | |
237 get a reversed label and the parents of the old node as children. | |
238 """ | |
239 if not node: | |
240 sink.append(parent) | |
241 elif id(node) not in nodemap: | |
242 nodemap[id(node)] = (node[0][::-1], [parent]) | |
243 for child in node[1]: | |
244 dfs(child, nodemap[id(node)]) | |
245 else: | |
246 nodemap[id(node)][1].append(parent) | |
247 | |
248 for node in dafsa: | |
249 dfs(node, None) | |
250 return sink | |
251 | |
252 | |
253 def join_labels(dafsa): | |
254 """Generates a new DAFSA where internal nodes are merged if there is a one to | |
255 one connection. | |
256 """ | |
257 parentcount = { id(None): 2 } | |
258 nodemap = { id(None): None } | |
259 | |
260 def count_parents(node): | |
261 """Count incoming references""" | |
262 if id(node) in parentcount: | |
263 parentcount[id(node)] += 1 | |
264 else: | |
265 parentcount[id(node)] = 1 | |
266 for child in node[1]: | |
267 count_parents(child) | |
268 | |
269 def join(node): | |
270 """Create new nodes""" | |
271 if id(node) not in nodemap: | |
272 children = [join(child) for child in node[1]] | |
273 if len(children) == 1 and parentcount[id(node[1][0])] == 1: | |
274 child = children[0] | |
275 nodemap[id(node)] = (node[0] + child[0], child[1]) | |
276 else: | |
277 nodemap[id(node)] = (node[0], children) | |
278 return nodemap[id(node)] | |
279 | |
280 for node in dafsa: | |
281 count_parents(node) | |
282 return [join(node) for node in dafsa] | |
283 | |
284 | |
285 def join_suffixes(dafsa): | |
286 """Generates a new DAFSA where nodes that represent the same word lists | |
287 towards the sink are merged. | |
288 """ | |
289 nodemap = { frozenset(('',)): None } | |
290 | |
291 def join(node): | |
292 """Returns a macthing node. A new node is created if no matching node | |
293 exists. The graph is accessed in dfs order. | |
294 """ | |
295 suffixes = frozenset(to_words(node)) | |
296 if suffixes not in nodemap: | |
297 nodemap[suffixes] = (node[0], [join(child) for child in node[1]]) | |
298 return nodemap[suffixes] | |
299 | |
300 return [join(node) for node in dafsa] | |
301 | |
302 | |
303 def top_sort(dafsa): | |
304 """Generates list of nodes in topological sort order.""" | |
305 incoming = {} | |
306 | |
307 def count_incoming(node): | |
308 """Counts incoming references.""" | |
309 if node: | |
310 if id(node) not in incoming: | |
311 incoming[id(node)] = 1 | |
312 for child in node[1]: | |
313 count_incoming(child) | |
314 else: | |
315 incoming[id(node)] += 1 | |
316 | |
317 for node in dafsa: | |
318 count_incoming(node) | |
319 | |
320 for node in dafsa: | |
321 incoming[id(node)] -= 1 | |
322 | |
323 waiting = [node for node in dafsa if incoming[id(node)] == 0] | |
324 nodes = [] | |
325 | |
326 while waiting: | |
327 node = waiting.pop() | |
328 assert incoming[id(node)] == 0 | |
329 nodes.append(node) | |
330 for child in node[1]: | |
331 if child: | |
332 incoming[id(child)] -= 1 | |
333 if incoming[id(child)] == 0: | |
334 waiting.append(child) | |
335 return nodes | |
336 | |
337 | |
338 def encode_links(children, offsets, current): | |
339 """Encodes a list of children as one, two or three byte offsets.""" | |
340 if not children[0]: | |
341 # This is an <end_label> node and no links follow such nodes | |
342 assert len(children) == 1 | |
343 return [] | |
344 guess = 3 * len(children) | |
345 assert children | |
346 children = sorted(children, key = lambda x: -offsets[id(x)]) | |
347 while True: | |
348 offset = current + guess | |
349 buf = [] | |
350 for child in children: | |
351 last = len(buf) | |
352 distance = offset - offsets[id(child)] | |
353 assert distance > 0 and distance < (1 << 21) | |
354 | |
355 if distance < (1 << 6): | |
356 # A 6-bit offset: "s0xxxxxx" | |
357 buf.append(distance) | |
358 elif distance < (1 << 13): | |
359 # A 13-bit offset: "s10xxxxxxxxxxxxx" | |
360 buf.append(0x40 | (distance >> 8)) | |
361 buf.append(distance & 0xFF) | |
362 else: | |
363 # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx" | |
364 buf.append(0x60 | (distance >> 16)) | |
365 buf.append((distance >> 8) & 0xFF) | |
366 buf.append(distance & 0xFF) | |
367 # Distance in first link is relative to following record. | |
368 # Distance in other links are relative to previous link. | |
369 offset -= distance | |
370 if len(buf) == guess: | |
371 break | |
372 guess = len(buf) | |
373 # Set most significant bit to mark end of links in this node. | |
374 buf[last] |= (1 << 7) | |
375 buf.reverse() | |
376 return buf | |
377 | |
378 | |
379 def encode_prefix(label): | |
380 """Encodes a node label as a list of bytes without a trailing high byte. | |
381 | |
382 This method encodes a node if there is exactly one child and the | |
383 child follows immidiately after so that no jump is needed. This label | |
384 will then be a prefix to the label in the child node. | |
385 """ | |
386 assert label | |
387 return [ord(c) for c in reversed(label)] | |
388 | |
389 | |
390 def encode_label(label): | |
391 """Encodes a node label as a list of bytes with a trailing high byte >0x80. | |
392 """ | |
393 buf = encode_prefix(label) | |
394 # Set most significant bit to mark end of label in this node. | |
395 buf[0] |= (1 << 7) | |
396 return buf | |
397 | |
398 | |
399 def encode(dafsa): | |
400 """Encodes a DAFSA to a list of bytes""" | |
401 output = [] | |
402 offsets = {} | |
403 | |
404 for node in reversed(top_sort(dafsa)): | |
405 if (len(node[1]) == 1 and node[1][0] and | |
406 (offsets[id(node[1][0])] == len(output))): | |
407 output.extend(encode_prefix(node[0])) | |
408 else: | |
409 output.extend(encode_links(node[1], offsets, len(output))) | |
410 output.extend(encode_label(node[0])) | |
411 offsets[id(node)] = len(output) | |
412 | |
413 output.extend(encode_links(dafsa, offsets, len(output))) | |
414 output.reverse() | |
415 return output | |
416 | |
417 | |
418 def to_cxx(data): | |
419 """Generates C++ code from a list of encoded bytes.""" | |
420 text = '/* This file is generated. DO NOT EDIT!\n\n' | |
421 text += 'The byte array encodes effective tld names. See make_dafsa.py for' | |
422 text += ' documentation.' | |
423 text += '*/\n\n' | |
424 text += 'const unsigned char kDafsa[%s] = {\n' % len(data) | |
425 for i in range(0, len(data), 12): | |
426 text += ' ' | |
427 text += ', '.join('0x%02x' % byte for byte in data[i:i + 12]) | |
428 text += ',\n' | |
429 text += '};\n' | |
430 return text | |
431 | |
432 | |
433 def words_to_cxx(words): | |
434 """Generates C++ code from a word list""" | |
435 dafsa = to_dafsa(words) | |
436 for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): | |
437 dafsa = fun(dafsa) | |
438 return to_cxx(encode(dafsa)) | |
439 | |
440 | |
441 def parse_gperf(infile): | |
442 """Parses gperf file and extract strings and return code""" | |
443 lines = [line.strip() for line in infile] | |
444 # Extract strings after the first '%%' and before the second '%%'. | |
445 begin = lines.index('%%') + 1 | |
446 end = lines.index('%%', begin) | |
447 lines = lines[begin:end] | |
448 for line in lines: | |
449 if line[-3:-1] != ', ': | |
450 raise InputError('Expected "domainname, <digit>", found "%s"' % line) | |
451 # Technically the DAFSA format can support return values in the range | |
452 # [0-31], but only the first three bits have any defined meaning. | |
453 if not line.endswith(('0', '1', '2', '3', '4', '5', '6', '7')): | |
454 raise InputError('Expected value to be in the range of 0-7, found "%s"' % | |
455 line[-1]) | |
456 return [line[:-3] + line[-1] for line in lines] | |
457 | |
458 | |
459 def main(): | |
460 if len(sys.argv) != 3: | |
461 print('usage: %s infile outfile' % sys.argv[0]) | |
462 return 1 | |
463 with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile: | |
464 outfile.write(words_to_cxx(parse_gperf(infile))) | |
465 return 0 | |
466 | |
467 | |
468 if __name__ == '__main__': | |
469 sys.exit(main()) | |
OLD | NEW |