| OLD | NEW |
| 1 #!/usr/bin/python | 1 #!/usr/bin/env python |
| 2 # Copyright (c) 2009 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 """Process a History database and dump a .dot file suitable for GraphViz. | 6 """Process a History database and dump a .dot file suitable for GraphViz. |
| 7 | 7 |
| 8 This is useful for debugging history redirect flows. | 8 This is useful for debugging history redirect flows. |
| 9 | 9 |
| 10 An example run of this program: | 10 An example run of this program: |
| 11 python /src/history-viz.py History > foo.dot | 11 python /src/history-viz.py History > foo.dot |
| 12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png | 12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png |
| 13 """ | 13 """ |
| 14 | 14 |
| 15 import struct | 15 import struct |
| 16 import subprocess | 16 import subprocess |
| 17 import sys | 17 import sys |
| 18 import urlparse | 18 import urlparse |
| 19 | 19 |
| 20 class URL: | 20 |
| 21 # Some transition types, copied from page_transition_types.h. |
| 22 TRANS_TYPES = { |
| 23 0: 'link', |
| 24 1: 'typed', |
| 25 2: 'most-visited', |
| 26 3: 'auto subframe', |
| 27 7: 'form', |
| 28 } |
| 29 |
| 30 |
| 31 class URL(object): |
| 21 """Represents a broken-down URL from our most visited database.""" | 32 """Represents a broken-down URL from our most visited database.""" |
| 22 | 33 |
| 23 def __init__(self, id, url): | 34 def __init__(self, id, url): |
| 24 """Initialize a new URL object. |id| is the database id of the URL.""" | 35 """Initialize a new URL object. |id| is the database id of the URL.""" |
| 25 self.id = id | 36 self.id = id |
| 26 self.url = url | 37 self.url = url |
| 27 scheme, loc, path, query, fragment = urlparse.urlsplit(url) | 38 scheme, loc, path, query, fragment = urlparse.urlsplit(url) |
| 28 if scheme == 'http': | 39 if scheme == 'http': |
| 29 scheme = '' # Shorten for display purposes. | 40 scheme = '' # Shorten for display purposes. |
| 30 if len(scheme) > 0: | 41 if len(scheme) > 0: |
| (...skipping 29 matching lines...) Expand all Loading... |
| 60 if len(part) > MAX_LEN: | 71 if len(part) > MAX_LEN: |
| 61 part = part[0:MAX_LEN-3] + '...' | 72 part = part[0:MAX_LEN-3] + '...' |
| 62 if len(line)+len(part) > MAX_LEN: | 73 if len(line)+len(part) > MAX_LEN: |
| 63 lines.append(line) | 74 lines.append(line) |
| 64 line = '' | 75 line = '' |
| 65 line += part | 76 line += part |
| 66 if len(line) > 0: | 77 if len(line) > 0: |
| 67 lines.append(line) | 78 lines.append(line) |
| 68 return '\n'.join(lines) | 79 return '\n'.join(lines) |
| 69 | 80 |
| 70 class Edge: | 81 |
| 82 class Edge(object): |
| 71 """Represents an edge in the history graph, connecting two pages. | 83 """Represents an edge in the history graph, connecting two pages. |
| 72 | 84 |
| 73 If a link is traversed twice, it is one Edge with two entries in | 85 If a link is traversed twice, it is one Edge with two entries in |
| 74 the .transitions array.""" | 86 the .transitions array.""" |
| 75 def __init__(self, src, dst): | 87 def __init__(self, src, dst): |
| 76 self.src = src | 88 self.src = src |
| 77 self.dst = dst | 89 self.dst = dst |
| 78 self.transitions = [] | 90 self.transitions = [] |
| 79 | 91 |
| 80 def Transitions(self): | 92 def Transitions(self): |
| 81 """Return a dictionary mapping transition type -> occurences.""" | 93 """Return a dictionary mapping transition type -> occurences.""" |
| 82 all = {} | 94 all = {} |
| 83 for trans in self.transitions: | 95 for trans in self.transitions: |
| 84 all[trans] = all.get(trans, 0) + 1 | 96 all[trans] = all.get(trans, 0) + 1 |
| 85 # We currently don't use the chain type. | 97 # We currently don't use the chain type. |
| 86 # TODO(evanm): make this a command-line option. | 98 # TODO(evanm): make this a command-line option. |
| 87 # if trans & 0x30000000 != 0: | 99 # if trans & 0x30000000 != 0: |
| 88 # chain = '' | 100 # chain = '' |
| 89 # if trans & 0x10000000: | 101 # if trans & 0x10000000: |
| 90 # chain = 'start' | 102 # chain = 'start' |
| 91 # if trans & 0x20000000: | 103 # if trans & 0x20000000: |
| 92 # if len(chain) == 0: | 104 # if len(chain) == 0: |
| 93 # chain = 'end' | 105 # chain = 'end' |
| 94 # else: | 106 # else: |
| 95 # chain = '' | 107 # chain = '' |
| 96 # if len(chain) > 0: | 108 # if len(chain) > 0: |
| 97 # edge['chain'] = chain | 109 # edge['chain'] = chain |
| 98 return all | 110 return all |
| 99 | 111 |
| 112 |
| 100 def ClusterBy(objs, pred): | 113 def ClusterBy(objs, pred): |
| 101 """Group a list of objects by a predicate. | 114 """Group a list of objects by a predicate. |
| 102 | 115 |
| 103 Given a list of objects and a predicate over the objects, return a | 116 Given a list of objects and a predicate over the objects, return a |
| 104 dictionary mapping pred(obj) -> all objs with the same pred(obj).""" | 117 dictionary mapping pred(obj) -> all objs with the same pred(obj).""" |
| 105 clusters = {} | 118 clusters = {} |
| 106 for obj in objs: | 119 for obj in objs: |
| 107 cluster = pred(obj) | 120 cluster = pred(obj) |
| 108 clusters[cluster] = clusters.get(cluster, []) | 121 clusters[cluster] = clusters.get(cluster, []) |
| 109 clusters[cluster].append(obj) | 122 clusters[cluster].append(obj) |
| 110 return clusters | 123 return clusters |
| 111 | 124 |
| 112 def EscapeDot(str): | 125 |
| 126 def EscapeDot(string): |
| 113 """Escape a string suitable for embedding in a graphviz graph.""" | 127 """Escape a string suitable for embedding in a graphviz graph.""" |
| 114 # TODO(evanm): this is likely not sufficient. | 128 # TODO(evanm): this is likely not sufficient. |
| 115 return str.replace('\n', '\\n') | 129 return string.replace('\n', '\\n') |
| 116 | 130 |
| 117 class SQLite: | 131 |
| 132 class SQLite(object): |
| 118 """Trivial interface to executing SQLite queries. | 133 """Trivial interface to executing SQLite queries. |
| 119 Spawns a new process with each call.""" | 134 Spawns a new process with each call.""" |
| 120 def __init__(self, file=None): | 135 def __init__(self, file=None): |
| 121 self.file = file | 136 self.file = file |
| 122 | 137 |
| 123 def Run(self, sql): | 138 def Run(self, sql): |
| 124 """Execute |sql|, yielding each row of results as an array.""" | 139 """Execute |sql|, yielding each row of results as an array.""" |
| 125 subproc = subprocess.Popen(['sqlite', self.file], | 140 subproc = subprocess.Popen(['sqlite', self.file], |
| 126 stdin=subprocess.PIPE, | 141 stdin=subprocess.PIPE, |
| 127 stdout=subprocess.PIPE) | 142 stdout=subprocess.PIPE) |
| 128 subproc.stdin.write('.mode tabs\n') | 143 subproc.stdin.write('.mode tabs\n') |
| 129 subproc.stdin.write(sql + ';') | 144 subproc.stdin.write(sql + ';') |
| 130 subproc.stdin.close() | 145 subproc.stdin.close() |
| 131 for line in subproc.stdout: | 146 for line in subproc.stdout: |
| 132 row = line.strip().split('\t') | 147 row = line.strip().split('\t') |
| 133 yield row | 148 yield row |
| 134 | 149 |
| 150 |
| 135 def LoadHistory(filename): | 151 def LoadHistory(filename): |
| 136 db = SQLite(filename) | 152 db = SQLite(filename) |
| 137 | 153 |
| 138 urls = {} # Map of urlid => url. | 154 urls = {} # Map of urlid => url. |
| 139 urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node. | 155 urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node. |
| 140 for id, url in db.Run('SELECT id, url FROM urls'): | 156 for id, url in db.Run('SELECT id, url FROM urls'): |
| 141 urls[id] = URL(id, url) | 157 urls[id] = URL(id, url) |
| 142 | 158 |
| 143 visiturlids = {} # Map of visitid => urlid. | 159 visiturlids = {} # Map of visitid => urlid. |
| 144 visiturlids['0'] = '0' # '0' is our special 'start' node. | 160 visiturlids['0'] = '0' # '0' is our special 'start' node. |
| 145 edges = {} # Map of urlid->urlid->Edge. | 161 edges = {} # Map of urlid->urlid->Edge. |
| 146 for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition ' | 162 for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition ' |
| 147 'FROM visits ORDER BY id'): | 163 'FROM visits ORDER BY id'): |
| 148 visiturlids[dst] = url | 164 visiturlids[dst] = url |
| 149 src = visiturlids[src] | 165 src = visiturlids[src] |
| 150 dst = visiturlids[dst] | 166 dst = visiturlids[dst] |
| 151 edges[src] = edges.get(src, {}) | 167 edges[src] = edges.get(src, {}) |
| 152 edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst)) | 168 edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst)) |
| 153 # SQLite outputs transition values as signed integers, but they're really | 169 # SQLite outputs transition values as signed integers, but they're really |
| 154 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)". | 170 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)". |
| 155 trans = struct.unpack('I', struct.pack('i', int(trans)))[0] | 171 trans = struct.unpack('I', struct.pack('i', int(trans)))[0] |
| 156 edge.transitions.append(trans) | 172 edge.transitions.append(trans) |
| 157 | 173 |
| 158 return urls, edges | 174 return urls, edges |
| 159 | 175 |
| 160 # Some transition types, copied from page_transition_types.h. | |
| 161 TRANS_TYPES = { | |
| 162 0: 'link', | |
| 163 1: 'typed', | |
| 164 2: 'most-visited', | |
| 165 3: 'auto subframe', | |
| 166 7: 'form', | |
| 167 } | |
| 168 | 176 |
| 169 urls, edges = LoadHistory(sys.argv[1]) | 177 def main(): |
| 178 urls, edges = LoadHistory(sys.argv[1]) |
| 179 print 'digraph G {' |
| 180 print ' graph [rankdir=LR]' # Display left to right. |
| 181 print ' node [shape=box]' # Display nodes as boxes. |
| 182 print ' subgraph { rank=source; 0 [label="start"] }' |
| 170 | 183 |
| 171 print 'digraph G {' | 184 # Output all the nodes within graph clusters. |
| 172 print ' graph [rankdir=LR]' # Display left to right. | 185 hosts = ClusterBy(urls.values(), lambda url: url.host) |
| 173 print ' node [shape=box]' # Display nodes as boxes. | 186 for i, (host, urls) in enumerate(hosts.items()): |
| 174 print ' subgraph { rank=source; 0 [label="start"] }' | 187 # Cluster all URLs under this host if it has more than one entry. |
| 188 host_clustered = len(urls) > 1 |
| 189 if host_clustered: |
| 190 print 'subgraph clusterhost%d {' % i |
| 191 print ' label="%s"' % host |
| 192 paths = ClusterBy(urls, lambda url: url.path) |
| 193 for j, (path, urls) in enumerate(paths.items()): |
| 194 # Cluster all URLs under this host if it has more than one entry. |
| 195 path_clustered = host_clustered and len(urls) > 1 |
| 196 if path_clustered: |
| 197 print ' subgraph cluster%d%d {' % (i, j) |
| 198 print ' label="%s"' % path |
| 199 for url in urls: |
| 200 if url.id == '0': continue # We already output the special start node. |
| 201 pretty = url.PrettyPrint(include_host=not host_clustered, |
| 202 include_path=not path_clustered) |
| 203 print ' %s [label="%s"]' % (url.id, EscapeDot(pretty)) |
| 204 if path_clustered: |
| 205 print ' }' |
| 206 if host_clustered: |
| 207 print '}' |
| 175 | 208 |
| 176 # Output all the nodes within graph clusters. | 209 # Output all the edges between nodes. |
| 177 hosts = ClusterBy(urls.values(), lambda url: url.host) | 210 for src, dsts in edges.items(): |
| 178 for i, (host, urls) in enumerate(hosts.items()): | 211 for dst, edge in dsts.items(): |
| 179 # Cluster all URLs under this host if it has more than one entry. | 212 # Gather up all the transitions into the label. |
| 180 host_clustered = len(urls) > 1 | 213 label = [] # Label for the edge. |
| 181 if host_clustered: | 214 transitions = edge.Transitions() |
| 182 print 'subgraph clusterhost%d {' % i | 215 for trans, count in transitions.items(): |
| 183 print ' label="%s"' % host | 216 text = '' |
| 184 paths = ClusterBy(urls, lambda url: url.path) | 217 if count > 1: |
| 185 for j, (path, urls) in enumerate(paths.items()): | 218 text = '%dx ' % count |
| 186 # Cluster all URLs under this host if it has more than one entry. | 219 base_type = trans & 0xFF |
| 187 path_clustered = host_clustered and len(urls) > 1 | 220 redir = (trans & 0xC0000000) != 0 |
| 188 if path_clustered: | 221 start = (trans & 0x10000000) != 0 |
| 189 print ' subgraph cluster%d%d {' % (i, j) | 222 end = (trans & 0x20000000) != 0 |
| 190 print ' label="%s"' % path | 223 if start or end: |
| 191 for url in urls: | 224 if start: |
| 192 if url.id == '0': continue # We already output the special start node. | 225 text += '<' |
| 193 pretty = url.PrettyPrint(include_host=not host_clustered, | 226 if end: |
| 194 include_path=not path_clustered) | 227 text += '>' |
| 195 print ' %s [label="%s"]' % (url.id, EscapeDot(pretty)) | 228 text += ' ' |
| 196 if path_clustered: | 229 if redir: |
| 197 print ' }' | 230 text += 'R ' |
| 198 if host_clustered: | 231 text += TRANS_TYPES.get(base_type, 'trans%d' % base_type) |
| 199 print '}' | 232 label.append(text) |
| 233 if len(label) == 0: |
| 234 continue |
| 200 | 235 |
| 201 # Output all the edges between nodes. | 236 edgeattrs = [] # Graphviz attributes for the edge. |
| 202 for src, dsts in edges.items(): | 237 # If the edge is from the start and the transitions are fishy, make it |
| 203 for dst, edge in dsts.items(): | 238 # display as a dotted line. |
| 204 # Gather up all the transitions into the label. | 239 if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0): |
| 205 label = [] # Label for the edge. | 240 edgeattrs.append('style=dashed') |
| 206 transitions = edge.Transitions() | 241 if len(label) > 0: |
| 207 for trans, count in transitions.items(): | 242 edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label))) |
| 208 text = '' | |
| 209 if count > 1: | |
| 210 text = '%dx ' % count | |
| 211 base_type = trans & 0xFF | |
| 212 redir = (trans & 0xC0000000) != 0 | |
| 213 start = (trans & 0x10000000) != 0 | |
| 214 end = (trans & 0x20000000) != 0 | |
| 215 if start or end: | |
| 216 if start: | |
| 217 text += '<' | |
| 218 if end: | |
| 219 text += '>' | |
| 220 text += ' ' | |
| 221 if redir: | |
| 222 text += 'R ' | |
| 223 text += TRANS_TYPES.get(base_type, 'trans%d' % base_type) | |
| 224 label.append(text) | |
| 225 if len(label) == 0: | |
| 226 continue | |
| 227 | 243 |
| 228 edgeattrs = [] # Graphviz attributes for the edge. | 244 out = '%s -> %s' % (src, dst) |
| 229 # If the edge is from the start and the transitions are fishy, make it | 245 if len(edgeattrs) > 0: |
| 230 # display as a dotted line. | 246 out += ' [%s]' % ','.join(edgeattrs) |
| 231 if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0): | 247 print out |
| 232 edgeattrs.append('style=dashed') | 248 print '}' |
| 233 if len(label) > 0: | 249 return 0 |
| 234 edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label))) | |
| 235 | 250 |
| 236 out = '%s -> %s' % (src, dst) | |
| 237 if len(edgeattrs) > 0: | |
| 238 out += ' [%s]' % ','.join(edgeattrs) | |
| 239 print out | |
| 240 print '}' | |
| 241 | 251 |
| 252 if __name__ == '__main__': |
| 253 sys.exit(main()) |
| OLD | NEW |