| Index: net/tools/tld_cleanup/make_dafsa.py
|
| diff --git a/net/tools/tld_cleanup/make_dafsa.py b/net/tools/tld_cleanup/make_dafsa.py
|
| new file mode 100755
|
| index 0000000000000000000000000000000000000000..775ff4fa33a2fa15934264b1dbf6b457e36bc974
|
| --- /dev/null
|
| +++ b/net/tools/tld_cleanup/make_dafsa.py
|
| @@ -0,0 +1,423 @@
|
| +#!/usr/bin/python
|
| +
|
| +'''
|
| +A Deterministic acyclic finite state automaton (DAFSA) is a compact
|
| +representation of an unordered word list (dictionary).
|
| +
|
| +http://en.wikipedia.org/wiki/Deterministic_acyclic_finite_state_automaton
|
| +
|
| +This python program converts a list of strings to a byte array in C++.
|
| +This python program fetches strings and return values from a gperf file
|
| +and generates a C++ file with a byte array representing graph that can be
|
| +used as a memory efficient replacement for the perfect hash table.
|
| +
|
| +The input strings are assumed to consist of printable 7-bit ASCII characters
|
| +and the return values are assumed to be one digit integers.
|
| +
|
| +In this program a DAFSA is a diamond shaped graph starting at a common
|
| +source node and ending at a common sink node. All internal nodes contain
|
| +a label and each word is represented by the labels in one path from
|
| +the source node to the sink node.
|
| +
|
| +The following python represention is used for nodes:
|
| +
|
| + Source node: [ children ]
|
| + Internal node: (label, [ children ])
|
| + Sink node: None
|
| +
|
| +The graph is first compressed by prefixes like a trie. In the next step
|
| +suffixes are compressed so that the graph gets diamond shaped. Finally
|
| +one to one linked nodes are replaced by nodes with the labels joined.
|
| +
|
| +The order of the operations is crucial since lookups will be performed
|
| +starting from the source with no backtracking. Thus a node must have at
|
| +most one child with a label starting by the same character. The output
|
| +is also arranged so that all jumps are to increasing addresses, thus forward
|
| +in memory.
|
| +
|
| +The generated output has suffix free decoding so that the sign of leading
|
| +bits in a link (a reference to a child node) indicate if it has a size of one,
|
| +two or three bytes and if it is the last outgoing link from the actual node.
|
| +A node label is terminated by a byte with the leading bit set.
|
| +
|
| +The generated byte array can described by the following BNF:
|
| +
|
| +<byte> ::= < 8-bit value in range [0x00-0xFF] >
|
| +
|
| +<char> ::= < printable 7-bit ASCII character, byte in range [0x20-0x7F] >
|
| +<end_char> ::= < char + 0x80, byte in range [0xA0-0xFF] >
|
| +<return value> ::= < value + 0x80, byte in range [0x80-0x8F] >
|
| +
|
| +<offset1> ::= < byte in range [0x00-0x3F] >
|
| +<offset2> ::= < byte in range [0x40-0x5F] >
|
| +<offset3> ::= < byte in range [0x60-0x7F] >
|
| +
|
| +<end_offset1> ::= < byte in range [0x80-0xBF] >
|
| +<end_offset2> ::= < byte in range [0xC0-0xDF] >
|
| +<end_offset3> ::= < byte in range [0xE0-0xFF] >
|
| +
|
| +<label> ::= <end_char>
|
| + | <char> <label>
|
| +
|
| +<end_label> ::= <return_value>
|
| + | <char> <end_label>
|
| +
|
| +<offset> ::= <offset1>
|
| + | <offset2> <byte>
|
| + | <offset3> <byte> <byte>
|
| +
|
| +<end_offset> ::= <end_offset1>
|
| + | <end_offset2> <byte>
|
| + | <end_offset3> <byte> <byte>
|
| +
|
| +<offsets> ::= <end_offset>
|
| + | <offset> <offsets>
|
| +
|
| +<source> ::= <offsets>
|
| +
|
| +<node> ::= <label> <offsets>
|
| + | <end_label>
|
| +
|
| +<dafsa> ::= <source>
|
| + | <dafsa> <node>
|
| +
|
| +Decoding:
|
| +
|
| +<char> -> printable 7-bit ASCII character
|
| +<end_char> & 0x7F -> printable 7-bit ASCII character
|
| +<return value> & 0x0F -> integer
|
| +<offset1 & 0x3F> -> integer
|
| +((<offset2> & 0x1F>) << 8) + <byte> -> integer
|
| +((<offset3> & 0x1F>) << 16) + (<byte> << 8) + <byte> -> integer
|
| +
|
| +end_offset1, end_offset2 and and_offset3 are decoded same as offset1,
|
| +offset2 and offset3 respectively.
|
| +
|
| +The first offset in a list of offsets is the distance in bytes between the
|
| +offset itself and the first child node. Subsequent offsets are the distance
|
| +between previous child node and next child node. Thus each offset links a node
|
| +to a child node. The distance is always counted between start addresses, i.e.
|
| +first byte in decoded offset or first byte in child node.
|
| +
|
| +Example 1:
|
| +
|
| +%%
|
| +aa, 1
|
| +a, 2
|
| +%%
|
| +
|
| +The input is first parsed to a list of words:
|
| +["aa1", "a2"]
|
| +
|
| +A fully expanded graph is created from the words:
|
| +source = [node1, node4]
|
| +node1 = ("a", [node2])
|
| +node2 = ("a", [node3])
|
| +node3 = ("\x01", [sink])
|
| +node4 = ("a", [node5])
|
| +node5 = ("\x02", [sink])
|
| +sink = None
|
| +
|
| +Compression results in the following graph:
|
| +source = [node1]
|
| +node1 = ("a", [node2, node3])
|
| +node2 = ("\x02", [sink])
|
| +node3 = ("a\x01", [sink])
|
| +sink = None
|
| +
|
| +A C++ representation of the compressed graph is generated:
|
| +
|
| +const unsigned char dafsa[7] = {
|
| + 0x81, 0xE1, 0x02, 0x81, 0x82, 0x61, 0x81,
|
| +};
|
| +
|
| +The bytes in the generated array has the following meaning:
|
| +
|
| + 0: 0x81 <end_offset1> child at position 0 + (0x81 & 0x3F) -> jump to 1
|
| +
|
| + 1: 0xE1 <end_char> label character (0xE1 & 0x7F) -> match "a"
|
| + 2: 0x02 <offset1> child at position 2 + (0x02 & 0x3F) -> jump to 4
|
| +
|
| + 3: 0x81 <end_offset1> child at position 4 + (0x81 & 0x3F) -> jump to 5
|
| + 4: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
| +
|
| + 5: 0x61 <char> label character 0x61 -> match "a"
|
| + 6: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
| +
|
| +Example 2:
|
| +
|
| +%%
|
| +aa, 1
|
| +bbb, 2
|
| +baa, 1
|
| +%%
|
| +
|
| +The input is first parsed to a list of words:
|
| +["aa1", "bbb2", "baa1"]
|
| +
|
| +Compression results in the following graph:
|
| +source = [node1, node2]
|
| +node1 = ("b", [node2, node3])
|
| +node2 = ("aa\x01", [sink])
|
| +node3 = ("bb\x02", [sink])
|
| +sink = None
|
| +
|
| +A C++ representation of the compressed graph is generated:
|
| +
|
| +const unsigned char dafsa[11] = {
|
| + 0x02, 0x83, 0xE2, 0x02, 0x83, 0x61, 0x61, 0x81, 0x62, 0x62, 0x82,
|
| +};
|
| +
|
| +The bytes in the generated array has the following meaning:
|
| +
|
| + 0: 0x02 <offset1> child at position 0 + (0x02 & 0x3F) -> jump to 2
|
| + 1: 0x83 <end_offset1> child at position 2 + (0x83 & 0x3F) -> jump to 5
|
| +
|
| + 2: 0xE2 <end_char> label character (0xE2 & 0x7F) -> match "b"
|
| + 3: 0x02 <offset1> child at position 3 + (0x02 & 0x3F) -> jump to 5
|
| + 4: 0x83 <end_offset1> child at position 5 + (0x83 & 0x3F) -> jump to 8
|
| +
|
| + 5: 0x61 <char> label character 0x61 -> match "a"
|
| + 6: 0x61 <char> label character 0x61 -> match "a"
|
| + 7: 0x81 <return_value> 0x81 & 0x0F -> return 1
|
| +
|
| + 8: 0x62 <char> label character 0x62 -> match "b"
|
| + 9: 0x62 <char> label character 0x62 -> match "b"
|
| +10: 0x82 <return_value> 0x82 & 0x0F -> return 2
|
| +'''
|
| +
|
| +import sys
|
| +
|
| +def ToDafsa(words):
|
| + '''
|
| + Generate a DAFSA from a word list and return the source node
|
| +
|
| + Each word is split into characters so that each character is represented by
|
| + a unique node. It is assumed the word list is not empty.
|
| + '''
|
| + assert(words)
|
| + def ToNodes(word):
|
| + '''Split words into characters'''
|
| + # Must be printable 7-bit ASCII.
|
| + assert(0x1F < ord(word[0]) < 0x80)
|
| + if len(word) == 1:
|
| + return chr(ord(word[0]) & 0x0F), [None]
|
| + return word[0], [ToNodes(word[1:])]
|
| + return [ToNodes(word) for word in words]
|
| +
|
| +def ToWords(node):
|
| + '''
|
| + Generate a word list from all paths starting from an internal node
|
| + '''
|
| + if node is None:
|
| + return ['']
|
| + return [(node[0] + word) for child in node[1] for word in ToWords(child)]
|
| +
|
| +def Reverse(dafsa):
|
| + '''
|
| + Generate a new DAFSA that is reversed, so that the old sink node becomes the
|
| + new source node.
|
| + '''
|
| + def Dfs(node, parent):
|
| + '''Create new nodes'''
|
| + if node is None:
|
| + sink.append(parent)
|
| + elif id(node) not in nodemap:
|
| + nodemap[id(node)] = (node[0][::-1], [parent])
|
| + for child in node[1]:
|
| + Dfs(child, nodemap[id(node)])
|
| + else:
|
| + nodemap[id(node)][1].append(parent)
|
| +
|
| + sink = []
|
| + nodemap = {}
|
| + for node in dafsa:
|
| + Dfs(node, None)
|
| + return sink
|
| +
|
| +def JoinLabels(dafsa):
|
| + '''
|
| + Generate a new DAFSA where internal nodes are merged if there is a one to
|
| + one connection.
|
| + '''
|
| + def CountParents(node):
|
| + '''Count incoming references'''
|
| + if id(node) in parentcount:
|
| + parentcount[id(node)] += 1
|
| + else:
|
| + parentcount[id(node)] = 1
|
| + for child in node[1]:
|
| + CountParents(child)
|
| +
|
| + def Join(node):
|
| + '''Create new nodes'''
|
| + if id(node) not in nodemap:
|
| + children = [Join(child) for child in node[1]]
|
| + if len(children) == 1 and parentcount[id(node[1][0])] == 1:
|
| + child = children[0]
|
| + nodemap[id(node)] = (node[0] + child[0], child[1])
|
| + else:
|
| + nodemap[id(node)] = (node[0], children)
|
| + return nodemap[id(node)]
|
| +
|
| + parentcount = { id(None): 2 }
|
| + for node in dafsa:
|
| + CountParents(node)
|
| + nodemap = { id(None): None }
|
| + return [Join(node) for node in dafsa]
|
| +
|
| +def JoinSuffixes(dafsa):
|
| + '''
|
| + Generate a new DAFSA where nodes that represent the same word lists towards
|
| + the sink are merged.
|
| + '''
|
| + def Join(node):
|
| + '''Return a macthing node. A new node is created if not matching node
|
| + exists. The graph is accessed in dfs order.
|
| + '''
|
| + suffixes = frozenset(ToWords(node))
|
| + if suffixes not in nodemap:
|
| + nodemap[suffixes] = (node[0], [Join(child) for child in node[1]])
|
| + return nodemap[suffixes]
|
| +
|
| + nodemap = { frozenset(('',)): None }
|
| + return [Join(node) for node in dafsa]
|
| +
|
| +def TopSort(dafsa):
|
| + '''Generate list of nodes in topological sort order'''
|
| + def CountIncoming(node):
|
| + '''Count incoming references'''
|
| + if node is not None:
|
| + if id(node) not in incoming:
|
| + incoming[id(node)] = 1
|
| + for child in node[1]:
|
| + CountIncoming(child)
|
| + else:
|
| + incoming[id(node)] += 1
|
| +
|
| + incoming = {}
|
| + for node in dafsa:
|
| + CountIncoming(node)
|
| +
|
| + for node in dafsa:
|
| + incoming[id(node)] -= 1
|
| +
|
| + waiting = [node for node in dafsa if incoming[id(node)] == 0]
|
| + nodes = []
|
| +
|
| + while waiting:
|
| + node = waiting.pop()
|
| + assert(incoming[id(node)] == 0)
|
| + nodes.append(node)
|
| + for child in node[1]:
|
| + if child is not None:
|
| + incoming[id(child)] -= 1
|
| + if incoming[id(child)] == 0:
|
| + waiting.append(child)
|
| + return nodes
|
| +
|
| +def EncodeLinks(children, offsets, current):
|
| + '''Encode a list of children as one, two or three byte offsets'''
|
| + if children[0] is None:
|
| + # This is an <end_label> node and no links are follow in such nodes
|
| + assert(len(children) == 1)
|
| + return []
|
| + guess = 3 * len(children)
|
| + assert(children)
|
| + while True:
|
| + offset = current + guess
|
| + buf = []
|
| + for child in sorted(children, key = lambda x: -offsets[id(x)]):
|
| + last = len(buf)
|
| + distance = offset - offsets[id(child)]
|
| + assert(distance > 0 and distance < (1 << 21))
|
| +
|
| + if distance < (1 << 6):
|
| + # A 6-bit offset: "s0xxxxxx"
|
| + buf.append(distance)
|
| + elif distance < (1 << 13):
|
| + # A 13-bit offset: "s10xxxxxxxxxxxxx"
|
| + buf.append(0x40 | (distance >> 8))
|
| + buf.append(distance & 0xFF)
|
| + else:
|
| + # A 21-bit offset: "s11xxxxxxxxxxxxxxxxxxxxx"
|
| + buf.append(0x60 | (distance >> 16))
|
| + buf.append((distance >> 8) & 0xFF)
|
| + buf.append(distance & 0xFF)
|
| + # Distance in first link is relative to following record.
|
| + # Distance in other links are relative to previous link.
|
| + offset -= distance
|
| + if len(buf) == guess:
|
| + break
|
| + guess = len(buf)
|
| + # Set most significant bit to mark end of links in this node.
|
| + buf[last] |= (1 << 7)
|
| + buf.reverse()
|
| + return buf
|
| +
|
| +def EncodeLabel(label):
|
| + '''
|
| + Encode a node label as a list of bytes with a trailing high byte >0x80.
|
| + '''
|
| + assert(label)
|
| + buf = [ord(c) for c in label]
|
| + buf.reverse()
|
| + # Set most significant bit to mark end of label in this node.
|
| + buf[0] |= (1 << 7)
|
| + return buf
|
| +
|
| +def Encode(dafsa):
|
| + '''Encode a DAFSA to a list of bytes'''
|
| + output = []
|
| + offsets = {}
|
| +
|
| + for node in reversed(TopSort(dafsa)):
|
| + output += EncodeLinks(node[1], offsets, len(output))
|
| + output += EncodeLabel(node[0])
|
| + offsets[id(node)] = len(output)
|
| +
|
| + output += EncodeLinks(dafsa, offsets, len(output))
|
| + output.reverse()
|
| + return output
|
| +
|
| +def ToCxx(data):
|
| + '''Generate C++ code from a list of encoded bytes'''
|
| + text = '/* This file is generated. Don\'t edit!\n\n'
|
| + text += ' The following command was used to generate the file:\n\n'
|
| + text += ' \"' + ' '.join(sys.argv) + '\"\n\n'
|
| + text += '*/\n\n'
|
| + text += 'const unsigned char kDafsa[%s] = {\n' % len(data)
|
| + for i in range(0, len(data), 12):
|
| + text += ' '
|
| + text += ', '.join(['0x%02x' % byte for byte in data[i:i + 12]])
|
| + text += ',\n'
|
| + text += '};\n'
|
| + return text
|
| +
|
| +def WordsToCxx(words):
|
| + '''Generate C++ code from a word list'''
|
| + dafsa = ToDafsa(words)
|
| + for fun in (Reverse, JoinSuffixes, Reverse, JoinSuffixes, JoinLabels):
|
| + dafsa = fun(dafsa)
|
| + byte_values = Encode(dafsa)
|
| + return ToCxx(byte_values)
|
| +
|
| +def ParseGperf(infile):
|
| + '''Parse gperf file and extract strings and return code'''
|
| + lines = [line.strip() for line in infile]
|
| + # Extract strings after the first '%%' and before the second '%%'.
|
| + begin = lines.index('%%') + 1
|
| + end = lines.index('%%', begin)
|
| + lines = lines[begin:end]
|
| + for line in lines:
|
| + assert(line[-3:-1] == ', ')
|
| + assert(line[-1] in '01234')
|
| + return [line[:-3] + line[-1] for line in lines]
|
| +
|
| +if __name__ == '__main__':
|
| + if len(sys.argv) != 3:
|
| + print('usage: %s infile outfile' % sys.argv[0])
|
| + sys.exit(-1)
|
| + INFILE = open(sys.argv[1], 'r')
|
| + OUTFILE = open(sys.argv[2], 'w')
|
| + OUTFILE.write(WordsToCxx(ParseGperf(INFILE)))
|
|
|