OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 """ |
| 7 A Deterministic acyclic finite state automaton (DAFSA) is a compact |
| 8 representation of an unordered word list (dictionary). |
| 9 |
| 10 http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton |
| 11 |
| 12 This python program converts a list of strings to a byte array in C++. |
| 13 This python program fetches strings and return values from a gperf file |
| 14 and generates a C++ file with a byte array representing graph that can be |
| 15 used as a memory efficient replacement for the perfect hash table. |
| 16 |
| 17 The input strings are assumed to consist of printable 7-bit ASCII characters |
| 18 and the return values are assumed to be one digit integers. |
| 19 |
| 20 In this program a DAFSA is a diamond shaped graph starting at a common |
| 21 source node and ending at a common sink node. All internal nodes contain |
| 22 a label and each word is represented by the labels in one path from |
| 23 the source node to the sink node. |
| 24 |
| 25 The following python represention is used for nodes: |
| 26 |
| 27 Source node: [ children ] |
| 28 Internal node: (label, [ children ]) |
| 29 Sink node: None |
| 30 |
| 31 The graph is first compressed by prefixes like a trie. In the next step |
| 32 suffixes are compressed so that the graph gets diamond shaped. Finally |
| 33 one to one linked nodes are replaced by nodes with the labels joined. |
| 34 |
| 35 The order of the operations is crucial since lookups will be performed |
| 36 starting from the source with no backtracking. Thus a node must have at |
| 37 most one child with a label starting by the same character. The output |
| 38 is also arranged so that all jumps are to increasing addresses, thus forward |
| 39 in memory. |
| 40 |
| 41 The generated output has suffix free decoding so that the sign of leading |
| 42 bits in a link (a reference to a child node) indicate if it has a size of one, |
| 43 two or three bytes and if it is the last outgoing link from the actual node. |
| 44 A node label is terminated by a byte with the leading bit set. |
| 45 |
| 46 The generated byte array can described by the following BNF: |
| 47 |
| 48 <byte> ::= < 8-bit value in range [0x00-0xFF] > |
| 49 |
| 50 <char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] > |
| 51 <end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] > |
| 52 <return value> ::= < value + 0x80, byte in range [0x80-0x8F] > |
| 53 |
| 54 <offset1> ::= < byte in range [0x00-0x3F] > |
| 55 <offset2> ::= < byte in range [0x40-0x5F] > |
| 56 <offset3> ::= < byte in range [0x60-0x7F] > |
| 57 |
| 58 <end_offset1> ::= < byte in range [0x80-0xBF] > |
| 59 <end_offset2> ::= < byte in range [0xC0-0xDF] > |
| 60 <end_offset3> ::= < byte in range [0xE0-0xFF] > |
| 61 |
| 62 <prefix> ::= <char> |
| 63 |
| 64 <label> ::= <end_char> |
| 65 | <char> <label> |
| 66 |
| 67 <end_label> ::= <return_value> |
| 68 | <char> <end_label> |
| 69 |
| 70 <offset> ::= <offset1> |
| 71 | <offset2> <byte> |
| 72 | <offset3> <byte> <byte> |
| 73 |
| 74 <end_offset> ::= <end_offset1> |
| 75 | <end_offset2> <byte> |
| 76 | <end_offset3> <byte> <byte> |
| 77 |
| 78 <offsets> ::= <end_offset> |
| 79 | <offset> <offsets> |
| 80 |
| 81 <source> ::= <offsets> |
| 82 |
| 83 <node> ::= <label> <offsets> |
| 84 | <prefix> <node> |
| 85 | <end_label> |
| 86 |
| 87 <dafsa> ::= <source> |
| 88 | <dafsa> <node> |
| 89 |
| 90 Decoding: |
| 91 |
| 92 <char> -> printable 7-bit ASCII character |
| 93 <end_char> & 0x7F -> printable 7-bit ASCII character |
| 94 <return value> & 0x0F -> integer |
| 95 <offset1 & 0x3F> -> integer |
| 96 ((<offset2> & 0x1F>) << 8) + <byte> -> integer |
| 97 ((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer |
| 98 |
| 99 end_offset1, end_offset2 and and_offset3 are decoded same as offset1, |
| 100 offset2 and offset3 respectively. |
| 101 |
| 102 The first offset in a list of offsets is the distance in bytes between the |
| 103 offset itself and the first child node. Subsequent offsets are the distance |
| 104 between previous child node and next child node. Thus each offset links a node |
| 105 to a child node. The distance is always counted between start addresses, i.e. |
| 106 first byte in decoded offset or first byte in child node. |
| 107 |
| 108 Example 1: |
| 109 |
| 110 %% |
| 111 aa, 1 |
| 112 a, 2 |
| 113 %% |
| 114 |
| 115 The input is first parsed to a list of words: |
| 116 ["aa1", "a2"] |
| 117 |
| 118 A fully expanded graph is created from the words: |
| 119 source = [node1, node4] |
| 120 node1 = ("a", [node2]) |
| 121 node2 = ("a", [node3]) |
| 122 node3 = ("\x01", [sink]) |
| 123 node4 = ("a", [node5]) |
| 124 node5 = ("\x02", [sink]) |
| 125 sink = None |
| 126 |
| 127 Compression results in the following graph: |
| 128 source = [node1] |
| 129 node1 = ("a", [node2, node3]) |
| 130 node2 = ("\x02", [sink]) |
| 131 node3 = ("a\x01", [sink]) |
| 132 sink = None |
| 133 |
| 134 A C++ representation of the compressed graph is generated: |
| 135 |
| 136 const unsigned char dafsa[7] = { |
| 137 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81, |
| 138 }; |
| 139 |
| 140 The bytes in the generated array has the following meaning: |
| 141 |
| 142 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1 |
| 143 |
| 144 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a" |
| 145 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4 |
| 146 |
| 147 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5 |
| 148 4: 0x82 <return_value> 0x82 & 0x0F -> return 2 |
| 149 |
| 150 5: 0x61 <char> label character 0x61 -> match "a" |
| 151 6: 0x81 <return_value> 0x81 & 0x0F -> return 1 |
| 152 |
| 153 Example 2: |
| 154 |
| 155 %% |
| 156 aa, 1 |
| 157 bbb, 2 |
| 158 baa, 1 |
| 159 %% |
| 160 |
| 161 The input is first parsed to a list of words: |
| 162 ["aa1", "bbb2", "baa1"] |
| 163 |
| 164 Compression results in the following graph: |
| 165 source = [node1, node2] |
| 166 node1 = ("b", [node2, node3]) |
| 167 node2 = ("aa\x01", [sink]) |
| 168 node3 = ("bb\x02", [sink]) |
| 169 sink = None |
| 170 |
| 171 A C++ representation of the compressed graph is generated: |
| 172 |
| 173 const unsigned char dafsa[11] = { |
| 174 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82, |
| 175 }; |
| 176 |
| 177 The bytes in the generated array has the following meaning: |
| 178 |
| 179 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2 |
| 180 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5 |
| 181 |
| 182 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b" |
| 183 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5 |
| 184 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8 |
| 185 |
| 186 5: 0x61 <char> label character 0x61 -> match "a" |
| 187 6: 0x61 <char> label character 0x61 -> match "a" |
| 188 7: 0x81 <return_value> 0x81 & 0x0F -> return 1 |
| 189 |
| 190 8: 0x62 <char> label character 0x62 -> match "b" |
| 191 9: 0x62 <char> label character 0x62 -> match "b" |
| 192 10: 0x82 <return_value> 0x82 & 0x0F -> return 2 |
| 193 """ |
| 194 |
| 195 import sys |
| 196 |
| 197 class InputError(Exception): |
| 198 """Exception raised for errors in the input file.""" |
| 199 |
| 200 |
| 201 def to_dafsa(words): |
| 202 """Generates a DAFSA from a word list and returns the source node. |
| 203 |
| 204 Each word is split into characters so that each character is represented by |
| 205 a unique node. It is assumed the word list is not empty. |
| 206 """ |
| 207 if not words: |
| 208 raise InputError('The domain list must not be empty') |
| 209 def ToNodes(word): |
| 210 """Split words into characters""" |
| 211 if not 0x1F < ord(word[0]) < 0x80: |
| 212 raise InputError('Domain names must be printable 7-bit ASCII') |
| 213 if len(word) == 1: |
| 214 return chr(ord(word[0]) & 0x0F), [None] |
| 215 return word[0], [ToNodes(word[1:])] |
| 216 return [ToNodes(word) for word in words] |
| 217 |
| 218 |
| 219 def to_words(node): |
| 220 """Generates a word list from all paths starting from an internal node.""" |
| 221 if not node: |
| 222 return [''] |
| 223 return [(node[0] + word) for child in node[1] for word in to_words(child)] |
| 224 |
| 225 |
| 226 def reverse(dafsa): |
| 227 """Generates a new DAFSA that is reversed, so that the old sink node becomes |
| 228 the new source node. |
| 229 """ |
| 230 sink = [] |
| 231 nodemap = {} |
| 232 |
| 233 def dfs(node, parent): |
| 234 """Creates reverse nodes. |
| 235 |
| 236 A new reverse node will be created for each old node. The new node will |
| 237 get a reversed label and the parents of the old node as children. |
| 238 """ |
| 239 if not node: |
| 240 sink.append(parent) |
| 241 elif id(node) not in nodemap: |
| 242 nodemap[id(node)] = (node[0][::-1], [parent]) |
| 243 for child in node[1]: |
| 244 dfs(child, nodemap[id(node)]) |
| 245 else: |
| 246 nodemap[id(node)][1].append(parent) |
| 247 |
| 248 for node in dafsa: |
| 249 dfs(node, None) |
| 250 return sink |
| 251 |
| 252 |
| 253 def join_labels(dafsa): |
| 254 """Generates a new DAFSA where internal nodes are merged if there is a one to |
| 255 one connection. |
| 256 """ |
| 257 parentcount = { id(None): 2 } |
| 258 nodemap = { id(None): None } |
| 259 |
| 260 def count_parents(node): |
| 261 """Count incoming references""" |
| 262 if id(node) in parentcount: |
| 263 parentcount[id(node)] += 1 |
| 264 else: |
| 265 parentcount[id(node)] = 1 |
| 266 for child in node[1]: |
| 267 count_parents(child) |
| 268 |
| 269 def join(node): |
| 270 """Create new nodes""" |
| 271 if id(node) not in nodemap: |
| 272 children = [join(child) for child in node[1]] |
| 273 if len(children) == 1 and parentcount[id(node[1][0])] == 1: |
| 274 child = children[0] |
| 275 nodemap[id(node)] = (node[0] + child[0], child[1]) |
| 276 else: |
| 277 nodemap[id(node)] = (node[0], children) |
| 278 return nodemap[id(node)] |
| 279 |
| 280 for node in dafsa: |
| 281 count_parents(node) |
| 282 return [join(node) for node in dafsa] |
| 283 |
| 284 |
| 285 def join_suffixes(dafsa): |
| 286 """Generates a new DAFSA where nodes that represent the same word lists |
| 287 towards the sink are merged. |
| 288 """ |
| 289 nodemap = { frozenset(('',)): None } |
| 290 |
| 291 def join(node): |
| 292 """Returns a macthing node. A new node is created if no matching node |
| 293 exists. The graph is accessed in dfs order. |
| 294 """ |
| 295 suffixes = frozenset(to_words(node)) |
| 296 if suffixes not in nodemap: |
| 297 nodemap[suffixes] = (node[0], [join(child) for child in node[1]]) |
| 298 return nodemap[suffixes] |
| 299 |
| 300 return [join(node) for node in dafsa] |
| 301 |
| 302 |
| 303 def top_sort(dafsa): |
| 304 """Generates list of nodes in topological sort order.""" |
| 305 incoming = {} |
| 306 |
| 307 def count_incoming(node): |
| 308 """Counts incoming references.""" |
| 309 if node: |
| 310 if id(node) not in incoming: |
| 311 incoming[id(node)] = 1 |
| 312 for child in node[1]: |
| 313 count_incoming(child) |
| 314 else: |
| 315 incoming[id(node)] += 1 |
| 316 |
| 317 for node in dafsa: |
| 318 count_incoming(node) |
| 319 |
| 320 for node in dafsa: |
| 321 incoming[id(node)] -= 1 |
| 322 |
| 323 waiting = [node for node in dafsa if incoming[id(node)] == 0] |
| 324 nodes = [] |
| 325 |
| 326 while waiting: |
| 327 node = waiting.pop() |
| 328 assert incoming[id(node)] == 0 |
| 329 nodes.append(node) |
| 330 for child in node[1]: |
| 331 if child: |
| 332 incoming[id(child)] -= 1 |
| 333 if incoming[id(child)] == 0: |
| 334 waiting.append(child) |
| 335 return nodes |
| 336 |
| 337 |
| 338 def encode_links(children, offsets, current): |
| 339 """Encodes a list of children as one, two or three byte offsets.""" |
| 340 if not children[0]: |
| 341 # This is an <end_label> node and no links follow such nodes |
| 342 assert len(children) == 1 |
| 343 return [] |
| 344 guess = 3 * len(children) |
| 345 assert children |
| 346 children = sorted(children, key = lambda x: -offsets[id(x)]) |
| 347 while True: |
| 348 offset = current + guess |
| 349 buf = [] |
| 350 for child in children: |
| 351 last = len(buf) |
| 352 distance = offset - offsets[id(child)] |
| 353 assert distance > 0 and distance < (1 << 21) |
| 354 |
| 355 if distance < (1 << 6): |
| 356 # A 6-bit offset: "s0xxxxxx" |
| 357 buf.append(distance) |
| 358 elif distance < (1 << 13): |
| 359 # A 13-bit offset: "s10xxxxxxxxxxxxx" |
| 360 buf.append(0x40 | (distance >> 8)) |
| 361 buf.append(distance & 0xFF) |
| 362 else: |
| 363 # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx" |
| 364 buf.append(0x60 | (distance >> 16)) |
| 365 buf.append((distance >> 8) & 0xFF) |
| 366 buf.append(distance & 0xFF) |
| 367 # Distance in first link is relative to following record. |
| 368 # Distance in other links are relative to previous link. |
| 369 offset -= distance |
| 370 if len(buf) == guess: |
| 371 break |
| 372 guess = len(buf) |
| 373 # Set most significant bit to mark end of links in this node. |
| 374 buf[last] |= (1 << 7) |
| 375 buf.reverse() |
| 376 return buf |
| 377 |
| 378 |
| 379 def encode_prefix(label): |
| 380 """Encodes a node label as a list of bytes without a trailing high byte. |
| 381 |
| 382 This method encodes a node if there is exactly one child and the |
| 383 child follows immidiately after so that no jump is needed. This label |
| 384 will then be a prefix to the label in the child node. |
| 385 """ |
| 386 assert label |
| 387 return [ord(c) for c in reversed(label)] |
| 388 |
| 389 |
| 390 def encode_label(label): |
| 391 """Encodes a node label as a list of bytes with a trailing high byte >0x80. |
| 392 """ |
| 393 buf = encode_prefix(label) |
| 394 # Set most significant bit to mark end of label in this node. |
| 395 buf[0] |= (1 << 7) |
| 396 return buf |
| 397 |
| 398 |
| 399 def encode(dafsa): |
| 400 """Encodes a DAFSA to a list of bytes""" |
| 401 output = [] |
| 402 offsets = {} |
| 403 |
| 404 for node in reversed(top_sort(dafsa)): |
| 405 if (len(node[1]) == 1 and node[1][0] and |
| 406 (offsets[id(node[1][0])] == len(output))): |
| 407 output.extend(encode_prefix(node[0])) |
| 408 else: |
| 409 output.extend(encode_links(node[1], offsets, len(output))) |
| 410 output.extend(encode_label(node[0])) |
| 411 offsets[id(node)] = len(output) |
| 412 |
| 413 output.extend(encode_links(dafsa, offsets, len(output))) |
| 414 output.reverse() |
| 415 return output |
| 416 |
| 417 |
| 418 def to_cxx(data): |
| 419 """Generates C++ code from a list of encoded bytes.""" |
| 420 text = '/* This file is generated. DO NOT EDIT!\n\n' |
| 421 text += 'The byte array encodes effective tld names. See make_dafsa.py for' |
| 422 text += ' documentation.' |
| 423 text += '*/\n\n' |
| 424 text += 'const unsigned char kDafsa[%s] = {\n' % len(data) |
| 425 for i in range(0, len(data), 12): |
| 426 text += ' ' |
| 427 text += ', '.join('0x%02x' % byte for byte in data[i:i + 12]) |
| 428 text += ',\n' |
| 429 text += '};\n' |
| 430 return text |
| 431 |
| 432 |
| 433 def words_to_cxx(words): |
| 434 """Generates C++ code from a word list""" |
| 435 dafsa = to_dafsa(words) |
| 436 for fun in (reverse, join_suffixes, reverse, join_suffixes, join_labels): |
| 437 dafsa = fun(dafsa) |
| 438 return to_cxx(encode(dafsa)) |
| 439 |
| 440 |
| 441 def parse_gperf(infile): |
| 442 """Parses gperf file and extract strings and return code""" |
| 443 lines = [line.strip() for line in infile] |
| 444 # Extract strings after the first '%%' and before the second '%%'. |
| 445 begin = lines.index('%%') + 1 |
| 446 end = lines.index('%%', begin) |
| 447 lines = lines[begin:end] |
| 448 for line in lines: |
| 449 if line[-3:-1] != ', ': |
| 450 raise InputError('Expected "domainname, <digit>", found "%s"' % line) |
| 451 # Technically the DAFSA format could support return values in range [0-31], |
| 452 # but the values below are the only with a defined meaning. |
| 453 if line[-1] not in '0124': |
| 454 raise InputError('Expected value to be one of {0,1,2,4}, found "%s"' % |
| 455 line[-1]) |
| 456 return [line[:-3] + line[-1] for line in lines] |
| 457 |
| 458 |
| 459 def main(): |
| 460 if len(sys.argv) != 3: |
| 461 print('usage: %s infile outfile' % sys.argv[0]) |
| 462 return 1 |
| 463 with open(sys.argv[1], 'r') as infile, open(sys.argv[2], 'w') as outfile: |
| 464 outfile.write(words_to_cxx(parse_gperf(infile))) |
| 465 return 0 |
| 466 |
| 467 |
| 468 if __name__ == '__main__': |
| 469 sys.exit(main()) |
OLD | NEW |