OLD | NEW |
1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """A collection of ResourceGraphs. | 5 """A collection of ResourceGraphs. |
6 | 6 |
7 Processes multiple ResourceGraphs, all presumably from requests to the same | 7 Processes multiple ResourceGraphs, all presumably from requests to the same |
8 site. Common urls are collected in Bags and different statistics on the | 8 site. Common urls are collected in Bags and different statistics on the |
9 relationship between bags are collected. | 9 relationship between bags are collected. |
10 """ | 10 """ |
11 | 11 |
12 import collections | 12 import collections |
13 import json | 13 import json |
14 import sys | 14 import sys |
15 import urlparse | 15 import urlparse |
16 | 16 |
17 from collections import defaultdict | 17 from collections import defaultdict |
18 | 18 |
19 import content_classification_lens | 19 import content_classification_lens |
20 import dag | 20 import graph |
21 import user_satisfied_lens | 21 import user_satisfied_lens |
22 | 22 |
23 class GraphSack(object): | 23 class GraphSack(object): |
24 """Aggreate of ResourceGraphs. | 24 """Aggreate of RequestDependencyGraphs. |
25 | 25 |
26 Collects ResourceGraph nodes into bags, where each bag contains the nodes with | 26 Collects RequestDependencyGraph nodes into bags, where each bag contains the |
27 common urls. Dependency edges are tracked between bags (so that each bag may | 27 nodes with common urls. Dependency edges are tracked between bags (so that |
28 be considered as a node of a graph). This graph of bags is referred to as a | 28 each bag may be considered as a node of a graph). This graph of bags is |
29 sack. | 29 referred to as a sack. |
30 | 30 |
31 Each bag is associated with a dag.Node, even though the bag graph may not be a | 31 Each bag is associated with a dag.Node, even though the bag graph may not be a |
32 DAG. The edges are annotated with list of graphs and nodes that generated | 32 DAG. The edges are annotated with list of graphs and nodes that generated |
33 them. | 33 them. |
34 """ | 34 """ |
35 # See CoreSet(). | 35 # See CoreSet(). |
36 CORE_THRESHOLD = 0.8 | 36 CORE_THRESHOLD = 0.8 |
37 | 37 |
38 _GraphInfo = collections.namedtuple('_GraphInfo', ( | 38 _GraphInfo = collections.namedtuple('_GraphInfo', ( |
39 'cost', # The graph cost (aka critical path length). | 39 'cost', # The graph cost (aka critical path length). |
40 'total_costs', # A vector by node index of total cost of each node. | |
41 )) | 40 )) |
42 | 41 |
43 def __init__(self): | 42 def __init__(self): |
44 # A bag is a node in our combined graph. | 43 # Each bag in our sack is named as indicated by this map. |
45 self._bags = [] | 44 self._name_to_bag = {} |
46 # Each bag in our sack corresponds to a url, as expressed by this map. | 45 # List our edges by bag pairs: (from_bag, to_bag) -> graph.Edge. |
47 self._url_to_bag = {} | 46 self._edges = {} |
48 # Maps graph -> _GraphInfo structures for each graph we've consumed. | 47 # Maps graph -> _GraphInfo structures for each graph we've consumed. |
49 self._graph_info = {} | 48 self._graph_info = {} |
50 | 49 |
51 def ConsumeGraph(self, graph): | 50 # Our graph, updated after each ConsumeGraph. |
| 51 self._graph = None |
| 52 |
| 53 def ConsumeGraph(self, request_graph): |
52 """Add a graph and process. | 54 """Add a graph and process. |
53 | 55 |
54 Args: | 56 Args: |
55 graph: (ResourceGraph) the graph to add. The graph is processed sorted | 57 graph: (RequestDependencyGraph) the graph to add. |
56 according to its current filter. | |
57 """ | 58 """ |
58 assert graph not in self._graph_info | 59 assert graph not in self._graph_info |
59 critical_path = [] | 60 cost = request_graph.Cost() |
60 total_costs = [] | 61 self._graph_info[request_graph] = self._GraphInfo(cost=cost) |
61 cost = graph.Cost(path_list=critical_path, | 62 for n in request_graph.graph.Nodes(): |
62 costs_out=total_costs) | 63 self.AddNode(request_graph, n) |
63 self._graph_info[graph] = self._GraphInfo( | |
64 cost=cost, total_costs=total_costs) | |
65 for n in graph.Nodes(sort=True): | |
66 assert graph._node_filter(n.Node()) | |
67 self.AddNode(graph, n) | |
68 for node in critical_path: | |
69 self._url_to_bag[node.Url()].MarkCritical() | |
70 | 64 |
71 def AddNode(self, graph, node): | 65 # TODO(mattcary): this is inefficient but our current API doesn't require an |
| 66 # explicit graph creation from the client. |
| 67 self._graph = graph.DirectedGraph(self.bags, self._edges.itervalues()) |
| 68 |
| 69 def AddNode(self, request_graph, node): |
72 """Add a node to our collection. | 70 """Add a node to our collection. |
73 | 71 |
74 Args: | 72 Args: |
75 graph: (ResourceGraph) the graph in which the node lives. | 73 graph: (RequestDependencyGraph) the graph in which the node lives. |
76 node: (NodeInfo) the node to add. | 74 node: (RequestDependencyGraph node) the node to add. |
77 | 75 |
78 Returns: | 76 Returns: |
79 The Bag containing the node. | 77 The Bag containing the node. |
80 """ | 78 """ |
81 if not graph._node_filter(node): | 79 sack_name = self._GetSackName(node) |
82 return | 80 if sack_name not in self._name_to_bag: |
83 if node.Url() not in self._url_to_bag: | 81 self._name_to_bag[sack_name] = Bag(self, sack_name) |
84 new_index = len(self._bags) | 82 bag = self._name_to_bag[sack_name] |
85 self._bags.append(Bag(self, new_index, node.Url())) | 83 bag.AddNode(request_graph, node) |
86 self._url_to_bag[node.Url()] = self._bags[-1] | 84 return bag |
87 self._url_to_bag[node.Url()].AddNode(graph, node) | 85 |
88 return self._url_to_bag[node.Url()] | 86 def AddEdge(self, from_bag, to_bag): |
| 87 """Add an edge between two bags.""" |
| 88 if (from_bag, to_bag) not in self._edges: |
| 89 self._edges[(from_bag, to_bag)] = graph.Edge(from_bag, to_bag) |
89 | 90 |
90 def CoreSet(self, *graph_sets): | 91 def CoreSet(self, *graph_sets): |
91 """Compute the core set of this sack. | 92 """Compute the core set of this sack. |
92 | 93 |
93 The core set of a sack is the set of resource that are common to most of the | 94 The core set of a sack is the set of resource that are common to most of the |
94 graphs in the sack. A core set of a set of graphs are the resources that | 95 graphs in the sack. A core set of a set of graphs are the resources that |
95 appear with frequency at least CORE_THRESHOLD. For a collection of graph | 96 appear with frequency at least CORE_THRESHOLD. For a collection of graph |
96 sets, for instance pulling the same page under different network | 97 sets, for instance pulling the same page under different network |
97 connections, we intersect the core sets to produce a page core set that | 98 connections, we intersect the core sets to produce a page core set that |
98 describes the key resources used by the page. See https://goo.gl/LmqQRS for | 99 describes the key resources used by the page. See https://goo.gl/LmqQRS for |
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
134 @property | 135 @property |
135 def num_graphs(self): | 136 def num_graphs(self): |
136 return len(self.graph_info) | 137 return len(self.graph_info) |
137 | 138 |
138 @property | 139 @property |
139 def graph_info(self): | 140 def graph_info(self): |
140 return self._graph_info | 141 return self._graph_info |
141 | 142 |
142 @property | 143 @property |
143 def bags(self): | 144 def bags(self): |
144 return self._bags | 145 return self._name_to_bag.values() |
145 | 146 |
146 def _SingleCore(self, graph_set): | 147 def _SingleCore(self, graph_set): |
147 core = set() | 148 core = set() |
148 graph_set = set(graph_set) | 149 graph_set = set(graph_set) |
149 num_graphs = len(graph_set) | 150 num_graphs = len(graph_set) |
150 for b in self.bags: | 151 for b in self.bags: |
151 count = sum([g in graph_set for g in b.graphs]) | 152 count = sum([g in graph_set for g in b.graphs]) |
152 if float(count) / num_graphs > self.CORE_THRESHOLD: | 153 if float(count) / num_graphs > self.CORE_THRESHOLD: |
153 core.add(b.label) | 154 core.add(b.label) |
154 return core | 155 return core |
155 | 156 |
| 157 def _GetSackName(self, node): |
| 158 return self._MakeShortname(node.request.url) |
156 | 159 |
157 class Bag(dag.Node): | 160 @classmethod |
158 def __init__(self, sack, index, url): | 161 def _MakeShortname(cls, url): |
159 super(Bag, self).__init__(index) | 162 # TODO(lizeb): Move this method to a convenient common location. |
| 163 parsed = urlparse.urlparse(url) |
| 164 if parsed.scheme == 'data': |
| 165 if ';' in parsed.path: |
| 166 kind, _ = parsed.path.split(';', 1) |
| 167 else: |
| 168 kind, _ = parsed.path.split(',', 1) |
| 169 return 'data:' + kind |
| 170 path = parsed.path[:10] |
| 171 hostname = parsed.hostname if parsed.hostname else '?.?.?' |
| 172 return hostname + '/' + path |
| 173 |
| 174 |
| 175 class Bag(graph.Node): |
| 176 def __init__(self, sack, label): |
| 177 super(Bag, self).__init__() |
160 self._sack = sack | 178 self._sack = sack |
161 self._url = url | 179 self._label = label |
162 self._label = self._MakeShortname(url) | |
163 # Maps a ResourceGraph to its Nodes contained in this Bag. | 180 # Maps a ResourceGraph to its Nodes contained in this Bag. |
164 self._graphs = defaultdict(set) | 181 self._graphs = defaultdict(set) |
165 # Maps each successor bag to the set of (graph, node, graph-successor) | |
166 # tuples that generated it. | |
167 self._successor_sources = defaultdict(set) | |
168 # Maps each successor bag to a set of edge costs. This is just used to | |
169 # track min and max; if we want more statistics we'd have to count the | |
170 # costs with multiplicity. | |
171 self._successor_edge_costs = defaultdict(set) | |
172 | |
173 # Miscellaneous counts and costs used in display. | |
174 self._total_costs = [] | |
175 self._relative_costs = [] | |
176 self._num_critical = 0 | |
177 | |
178 @property | |
179 def url(self): | |
180 return self._url | |
181 | 182 |
182 @property | 183 @property |
183 def label(self): | 184 def label(self): |
184 return self._label | 185 return self._label |
185 | 186 |
186 @property | 187 @property |
187 def graphs(self): | 188 def graphs(self): |
188 return self._graphs | 189 return self._graphs.iterkeys() |
189 | |
190 @property | |
191 def successor_sources(self): | |
192 return self._successor_sources | |
193 | |
194 @property | |
195 def successor_edge_costs(self): | |
196 return self._successor_edge_costs | |
197 | |
198 @property | |
199 def total_costs(self): | |
200 return self._total_costs | |
201 | |
202 @property | |
203 def relative_costs(self): | |
204 return self._relative_costs | |
205 | |
206 @property | |
207 def num_critical(self): | |
208 return self._num_critical | |
209 | 190 |
210 @property | 191 @property |
211 def num_nodes(self): | 192 def num_nodes(self): |
212 return len(self._total_costs) | 193 return sum(len(g) for g in self._graphs.itervalues()) |
213 | 194 |
214 def MarkCritical(self): | 195 def AddNode(self, request_graph, node): |
215 self._num_critical += 1 | 196 if node in self._graphs[request_graph]: |
216 | |
217 def AddNode(self, graph, node): | |
218 if node in self._graphs[graph]: | |
219 return # Already added. | 197 return # Already added. |
220 graph_info = self._sack.graph_info[graph] | 198 self._graphs[request_graph].add(node) |
221 self._graphs[graph].add(node) | 199 for edge in request_graph.graph.OutEdges(node): |
222 node_total_cost = graph_info.total_costs[node.Index()] | 200 out_bag = self._sack.AddNode(request_graph, edge.to_node) |
223 self._total_costs.append(node_total_cost) | 201 self._sack.AddEdge(self, out_bag) |
224 self._relative_costs.append( | |
225 float(node_total_cost) / graph_info.cost) | |
226 for s in node.Node().Successors(): | |
227 if not graph._node_filter(s): | |
228 continue | |
229 node_info = graph.NodeInfo(s) | |
230 successor_bag = self._sack.AddNode(graph, node_info) | |
231 self.AddSuccessor(successor_bag) | |
232 self._successor_sources[successor_bag].add((graph, node, s)) | |
233 self._successor_edge_costs[successor_bag].add(graph.EdgeCost(node, s)) | |
234 | |
235 @classmethod | |
236 def _MakeShortname(cls, url): | |
237 parsed = urlparse.urlparse(url) | |
238 if parsed.scheme == 'data': | |
239 if ';' in parsed.path: | |
240 kind, _ = parsed.path.split(';', 1) | |
241 else: | |
242 kind, _ = parsed.path.split(',', 1) | |
243 return 'data:' + kind | |
244 path = parsed.path[:10] | |
245 hostname = parsed.hostname if parsed.hostname else '?.?.?' | |
246 return hostname + '/' + path | |
OLD | NEW |