OLD | NEW |
1 #!/usr/bin/python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2009 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2011 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Process a History database and dump a .dot file suitable for GraphViz. | 6 """Process a History database and dump a .dot file suitable for GraphViz. |
7 | 7 |
8 This is useful for debugging history redirect flows. | 8 This is useful for debugging history redirect flows. |
9 | 9 |
10 An example run of this program: | 10 An example run of this program: |
11 python /src/history-viz.py History > foo.dot | 11 python /src/history-viz.py History > foo.dot |
12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png | 12 /c/Program\ Files/Graphviz2.18/bin/dot -Tpng foo.dot -o foo.png |
13 """ | 13 """ |
14 | 14 |
15 import struct | 15 import struct |
16 import subprocess | 16 import subprocess |
17 import sys | 17 import sys |
18 import urlparse | 18 import urlparse |
19 | 19 |
20 class URL: | 20 |
| 21 # Some transition types, copied from page_transition_types.h. |
| 22 TRANS_TYPES = { |
| 23 0: 'link', |
| 24 1: 'typed', |
| 25 2: 'most-visited', |
| 26 3: 'auto subframe', |
| 27 7: 'form', |
| 28 } |
| 29 |
| 30 |
| 31 class URL(object): |
21 """Represents a broken-down URL from our most visited database.""" | 32 """Represents a broken-down URL from our most visited database.""" |
22 | 33 |
23 def __init__(self, id, url): | 34 def __init__(self, id, url): |
24 """Initialize a new URL object. |id| is the database id of the URL.""" | 35 """Initialize a new URL object. |id| is the database id of the URL.""" |
25 self.id = id | 36 self.id = id |
26 self.url = url | 37 self.url = url |
27 scheme, loc, path, query, fragment = urlparse.urlsplit(url) | 38 scheme, loc, path, query, fragment = urlparse.urlsplit(url) |
28 if scheme == 'http': | 39 if scheme == 'http': |
29 scheme = '' # Shorten for display purposes. | 40 scheme = '' # Shorten for display purposes. |
30 if len(scheme) > 0: | 41 if len(scheme) > 0: |
(...skipping 29 matching lines...) Expand all Loading... |
60 if len(part) > MAX_LEN: | 71 if len(part) > MAX_LEN: |
61 part = part[0:MAX_LEN-3] + '...' | 72 part = part[0:MAX_LEN-3] + '...' |
62 if len(line)+len(part) > MAX_LEN: | 73 if len(line)+len(part) > MAX_LEN: |
63 lines.append(line) | 74 lines.append(line) |
64 line = '' | 75 line = '' |
65 line += part | 76 line += part |
66 if len(line) > 0: | 77 if len(line) > 0: |
67 lines.append(line) | 78 lines.append(line) |
68 return '\n'.join(lines) | 79 return '\n'.join(lines) |
69 | 80 |
70 class Edge: | 81 |
| 82 class Edge(object): |
71 """Represents an edge in the history graph, connecting two pages. | 83 """Represents an edge in the history graph, connecting two pages. |
72 | 84 |
73 If a link is traversed twice, it is one Edge with two entries in | 85 If a link is traversed twice, it is one Edge with two entries in |
74 the .transitions array.""" | 86 the .transitions array.""" |
75 def __init__(self, src, dst): | 87 def __init__(self, src, dst): |
76 self.src = src | 88 self.src = src |
77 self.dst = dst | 89 self.dst = dst |
78 self.transitions = [] | 90 self.transitions = [] |
79 | 91 |
80 def Transitions(self): | 92 def Transitions(self): |
81 """Return a dictionary mapping transition type -> occurences.""" | 93 """Return a dictionary mapping transition type -> occurences.""" |
82 all = {} | 94 all = {} |
83 for trans in self.transitions: | 95 for trans in self.transitions: |
84 all[trans] = all.get(trans, 0) + 1 | 96 all[trans] = all.get(trans, 0) + 1 |
85 # We currently don't use the chain type. | 97 # We currently don't use the chain type. |
86 # TODO(evanm): make this a command-line option. | 98 # TODO(evanm): make this a command-line option. |
87 # if trans & 0x30000000 != 0: | 99 # if trans & 0x30000000 != 0: |
88 # chain = '' | 100 # chain = '' |
89 # if trans & 0x10000000: | 101 # if trans & 0x10000000: |
90 # chain = 'start' | 102 # chain = 'start' |
91 # if trans & 0x20000000: | 103 # if trans & 0x20000000: |
92 # if len(chain) == 0: | 104 # if len(chain) == 0: |
93 # chain = 'end' | 105 # chain = 'end' |
94 # else: | 106 # else: |
95 # chain = '' | 107 # chain = '' |
96 # if len(chain) > 0: | 108 # if len(chain) > 0: |
97 # edge['chain'] = chain | 109 # edge['chain'] = chain |
98 return all | 110 return all |
99 | 111 |
| 112 |
100 def ClusterBy(objs, pred): | 113 def ClusterBy(objs, pred): |
101 """Group a list of objects by a predicate. | 114 """Group a list of objects by a predicate. |
102 | 115 |
103 Given a list of objects and a predicate over the objects, return a | 116 Given a list of objects and a predicate over the objects, return a |
104 dictionary mapping pred(obj) -> all objs with the same pred(obj).""" | 117 dictionary mapping pred(obj) -> all objs with the same pred(obj).""" |
105 clusters = {} | 118 clusters = {} |
106 for obj in objs: | 119 for obj in objs: |
107 cluster = pred(obj) | 120 cluster = pred(obj) |
108 clusters[cluster] = clusters.get(cluster, []) | 121 clusters[cluster] = clusters.get(cluster, []) |
109 clusters[cluster].append(obj) | 122 clusters[cluster].append(obj) |
110 return clusters | 123 return clusters |
111 | 124 |
112 def EscapeDot(str): | 125 |
| 126 def EscapeDot(string): |
113 """Escape a string suitable for embedding in a graphviz graph.""" | 127 """Escape a string suitable for embedding in a graphviz graph.""" |
114 # TODO(evanm): this is likely not sufficient. | 128 # TODO(evanm): this is likely not sufficient. |
115 return str.replace('\n', '\\n') | 129 return string.replace('\n', '\\n') |
116 | 130 |
117 class SQLite: | 131 |
| 132 class SQLite(object): |
118 """Trivial interface to executing SQLite queries. | 133 """Trivial interface to executing SQLite queries. |
119 Spawns a new process with each call.""" | 134 Spawns a new process with each call.""" |
120 def __init__(self, file=None): | 135 def __init__(self, file=None): |
121 self.file = file | 136 self.file = file |
122 | 137 |
123 def Run(self, sql): | 138 def Run(self, sql): |
124 """Execute |sql|, yielding each row of results as an array.""" | 139 """Execute |sql|, yielding each row of results as an array.""" |
125 subproc = subprocess.Popen(['sqlite', self.file], | 140 subproc = subprocess.Popen(['sqlite', self.file], |
126 stdin=subprocess.PIPE, | 141 stdin=subprocess.PIPE, |
127 stdout=subprocess.PIPE) | 142 stdout=subprocess.PIPE) |
128 subproc.stdin.write('.mode tabs\n') | 143 subproc.stdin.write('.mode tabs\n') |
129 subproc.stdin.write(sql + ';') | 144 subproc.stdin.write(sql + ';') |
130 subproc.stdin.close() | 145 subproc.stdin.close() |
131 for line in subproc.stdout: | 146 for line in subproc.stdout: |
132 row = line.strip().split('\t') | 147 row = line.strip().split('\t') |
133 yield row | 148 yield row |
134 | 149 |
| 150 |
135 def LoadHistory(filename): | 151 def LoadHistory(filename): |
136 db = SQLite(filename) | 152 db = SQLite(filename) |
137 | 153 |
138 urls = {} # Map of urlid => url. | 154 urls = {} # Map of urlid => url. |
139 urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node. | 155 urls['0'] = URL('0', 'start') # Node name '0' is our special 'start' node. |
140 for id, url in db.Run('SELECT id, url FROM urls'): | 156 for id, url in db.Run('SELECT id, url FROM urls'): |
141 urls[id] = URL(id, url) | 157 urls[id] = URL(id, url) |
142 | 158 |
143 visiturlids = {} # Map of visitid => urlid. | 159 visiturlids = {} # Map of visitid => urlid. |
144 visiturlids['0'] = '0' # '0' is our special 'start' node. | 160 visiturlids['0'] = '0' # '0' is our special 'start' node. |
145 edges = {} # Map of urlid->urlid->Edge. | 161 edges = {} # Map of urlid->urlid->Edge. |
146 for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition ' | 162 for src, dst, url, trans in db.Run('SELECT from_visit, id, url, transition ' |
147 'FROM visits ORDER BY id'): | 163 'FROM visits ORDER BY id'): |
148 visiturlids[dst] = url | 164 visiturlids[dst] = url |
149 src = visiturlids[src] | 165 src = visiturlids[src] |
150 dst = visiturlids[dst] | 166 dst = visiturlids[dst] |
151 edges[src] = edges.get(src, {}) | 167 edges[src] = edges.get(src, {}) |
152 edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst)) | 168 edge = edges[src][dst] = edges[src].get(dst, Edge(src, dst)) |
153 # SQLite outputs transition values as signed integers, but they're really | 169 # SQLite outputs transition values as signed integers, but they're really |
154 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)". | 170 # a bitfield. Below does "unsigned trans = static_cast<unsigned>(trans)". |
155 trans = struct.unpack('I', struct.pack('i', int(trans)))[0] | 171 trans = struct.unpack('I', struct.pack('i', int(trans)))[0] |
156 edge.transitions.append(trans) | 172 edge.transitions.append(trans) |
157 | 173 |
158 return urls, edges | 174 return urls, edges |
159 | 175 |
160 # Some transition types, copied from page_transition_types.h. | |
161 TRANS_TYPES = { | |
162 0: 'link', | |
163 1: 'typed', | |
164 2: 'most-visited', | |
165 3: 'auto subframe', | |
166 7: 'form', | |
167 } | |
168 | 176 |
169 urls, edges = LoadHistory(sys.argv[1]) | 177 def main(): |
| 178 urls, edges = LoadHistory(sys.argv[1]) |
| 179 print 'digraph G {' |
| 180 print ' graph [rankdir=LR]' # Display left to right. |
| 181 print ' node [shape=box]' # Display nodes as boxes. |
| 182 print ' subgraph { rank=source; 0 [label="start"] }' |
170 | 183 |
171 print 'digraph G {' | 184 # Output all the nodes within graph clusters. |
172 print ' graph [rankdir=LR]' # Display left to right. | 185 hosts = ClusterBy(urls.values(), lambda url: url.host) |
173 print ' node [shape=box]' # Display nodes as boxes. | 186 for i, (host, urls) in enumerate(hosts.items()): |
174 print ' subgraph { rank=source; 0 [label="start"] }' | 187 # Cluster all URLs under this host if it has more than one entry. |
| 188 host_clustered = len(urls) > 1 |
| 189 if host_clustered: |
| 190 print 'subgraph clusterhost%d {' % i |
| 191 print ' label="%s"' % host |
| 192 paths = ClusterBy(urls, lambda url: url.path) |
| 193 for j, (path, urls) in enumerate(paths.items()): |
| 194 # Cluster all URLs under this host if it has more than one entry. |
| 195 path_clustered = host_clustered and len(urls) > 1 |
| 196 if path_clustered: |
| 197 print ' subgraph cluster%d%d {' % (i, j) |
| 198 print ' label="%s"' % path |
| 199 for url in urls: |
| 200 if url.id == '0': continue # We already output the special start node. |
| 201 pretty = url.PrettyPrint(include_host=not host_clustered, |
| 202 include_path=not path_clustered) |
| 203 print ' %s [label="%s"]' % (url.id, EscapeDot(pretty)) |
| 204 if path_clustered: |
| 205 print ' }' |
| 206 if host_clustered: |
| 207 print '}' |
175 | 208 |
176 # Output all the nodes within graph clusters. | 209 # Output all the edges between nodes. |
177 hosts = ClusterBy(urls.values(), lambda url: url.host) | 210 for src, dsts in edges.items(): |
178 for i, (host, urls) in enumerate(hosts.items()): | 211 for dst, edge in dsts.items(): |
179 # Cluster all URLs under this host if it has more than one entry. | 212 # Gather up all the transitions into the label. |
180 host_clustered = len(urls) > 1 | 213 label = [] # Label for the edge. |
181 if host_clustered: | 214 transitions = edge.Transitions() |
182 print 'subgraph clusterhost%d {' % i | 215 for trans, count in transitions.items(): |
183 print ' label="%s"' % host | 216 text = '' |
184 paths = ClusterBy(urls, lambda url: url.path) | 217 if count > 1: |
185 for j, (path, urls) in enumerate(paths.items()): | 218 text = '%dx ' % count |
186 # Cluster all URLs under this host if it has more than one entry. | 219 base_type = trans & 0xFF |
187 path_clustered = host_clustered and len(urls) > 1 | 220 redir = (trans & 0xC0000000) != 0 |
188 if path_clustered: | 221 start = (trans & 0x10000000) != 0 |
189 print ' subgraph cluster%d%d {' % (i, j) | 222 end = (trans & 0x20000000) != 0 |
190 print ' label="%s"' % path | 223 if start or end: |
191 for url in urls: | 224 if start: |
192 if url.id == '0': continue # We already output the special start node. | 225 text += '<' |
193 pretty = url.PrettyPrint(include_host=not host_clustered, | 226 if end: |
194 include_path=not path_clustered) | 227 text += '>' |
195 print ' %s [label="%s"]' % (url.id, EscapeDot(pretty)) | 228 text += ' ' |
196 if path_clustered: | 229 if redir: |
197 print ' }' | 230 text += 'R ' |
198 if host_clustered: | 231 text += TRANS_TYPES.get(base_type, 'trans%d' % base_type) |
199 print '}' | 232 label.append(text) |
| 233 if len(label) == 0: |
| 234 continue |
200 | 235 |
201 # Output all the edges between nodes. | 236 edgeattrs = [] # Graphviz attributes for the edge. |
202 for src, dsts in edges.items(): | 237 # If the edge is from the start and the transitions are fishy, make it |
203 for dst, edge in dsts.items(): | 238 # display as a dotted line. |
204 # Gather up all the transitions into the label. | 239 if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0): |
205 label = [] # Label for the edge. | 240 edgeattrs.append('style=dashed') |
206 transitions = edge.Transitions() | 241 if len(label) > 0: |
207 for trans, count in transitions.items(): | 242 edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label))) |
208 text = '' | |
209 if count > 1: | |
210 text = '%dx ' % count | |
211 base_type = trans & 0xFF | |
212 redir = (trans & 0xC0000000) != 0 | |
213 start = (trans & 0x10000000) != 0 | |
214 end = (trans & 0x20000000) != 0 | |
215 if start or end: | |
216 if start: | |
217 text += '<' | |
218 if end: | |
219 text += '>' | |
220 text += ' ' | |
221 if redir: | |
222 text += 'R ' | |
223 text += TRANS_TYPES.get(base_type, 'trans%d' % base_type) | |
224 label.append(text) | |
225 if len(label) == 0: | |
226 continue | |
227 | 243 |
228 edgeattrs = [] # Graphviz attributes for the edge. | 244 out = '%s -> %s' % (src, dst) |
229 # If the edge is from the start and the transitions are fishy, make it | 245 if len(edgeattrs) > 0: |
230 # display as a dotted line. | 246 out += ' [%s]' % ','.join(edgeattrs) |
231 if src == '0' and len(transitions.keys()) == 1 and transitions.has_key(0): | 247 print out |
232 edgeattrs.append('style=dashed') | 248 print '}' |
233 if len(label) > 0: | 249 return 0 |
234 edgeattrs.append('label="%s"' % EscapeDot('\n'.join(label))) | |
235 | 250 |
236 out = '%s -> %s' % (src, dst) | |
237 if len(edgeattrs) > 0: | |
238 out += ' [%s]' % ','.join(edgeattrs) | |
239 print out | |
240 print '}' | |
241 | 251 |
| 252 if __name__ == '__main__': |
| 253 sys.exit(main()) |
OLD | NEW |