Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 """A collection of ResourceGraphs. | 5 """A collection of ResourceGraphs. |
| 6 | 6 |
| 7 Processes multiple ResourceGraphs, all presumably from requests to the same | 7 Processes multiple ResourceGraphs, all presumably from requests to the same |
| 8 site. Common urls are collected in Bags and different statistics on the | 8 site. Common urls are collected in Bags and different statistics on the |
| 9 relationship between bags are collected. | 9 relationship between bags are collected. |
| 10 """ | 10 """ |
| 11 | 11 |
| 12 import collections | 12 import collections |
| 13 import json | 13 import json |
| 14 import sys | 14 import sys |
| 15 import urlparse | 15 import urlparse |
| 16 | 16 |
| 17 from collections import defaultdict | 17 from collections import defaultdict |
| 18 | 18 |
| 19 import content_classification_lens | 19 import content_classification_lens |
| 20 import dag | 20 import graph |
| 21 import user_satisfied_lens | 21 import user_satisfied_lens |
| 22 | 22 |
| 23 class GraphSack(object): | 23 class GraphSack(object): |
| 24 """Aggreate of ResourceGraphs. | 24 """Aggreate of RequestDependencyGraphs. |
| 25 | 25 |
| 26 Collects ResourceGraph nodes into bags, where each bag contains the nodes with | 26 Collects RequestDependencyGraph nodes into bags, where each bag contains the |
| 27 common urls. Dependency edges are tracked between bags (so that each bag may | 27 nodes with common urls. Dependency edges are tracked between bags (so that |
| 28 be considered as a node of a graph). This graph of bags is referred to as a | 28 each bag may be considered as a node of a graph). This graph of bags is |
| 29 sack. | 29 referred to as a sack. |
| 30 | 30 |
| 31 Each bag is associated with a dag.Node, even though the bag graph may not be a | 31 Each bag is associated with a dag.Node, even though the bag graph may not be a |
| 32 DAG. The edges are annotated with list of graphs and nodes that generated | 32 DAG. The edges are annotated with list of graphs and nodes that generated |
| 33 them. | 33 them. |
| 34 """ | 34 """ |
| 35 # See CoreSet(). | 35 # See CoreSet(). |
| 36 CORE_THRESHOLD = 0.8 | 36 CORE_THRESHOLD = 0.8 |
| 37 | 37 |
| 38 _GraphInfo = collections.namedtuple('_GraphInfo', ( | 38 _GraphInfo = collections.namedtuple('_GraphInfo', ( |
| 39 'cost', # The graph cost (aka critical path length). | 39 'cost', # The graph cost (aka critical path length). |
| 40 'total_costs', # A vector by node index of total cost of each node. | |
| 41 )) | 40 )) |
| 42 | 41 |
| 43 def __init__(self): | 42 def __init__(self): |
| 44 # A bag is a node in our combined graph. | 43 # Each bag in our sack is named as indicated by this map. |
| 45 self._bags = [] | 44 self._name_to_bag = {} |
| 46 # Each bag in our sack corresponds to a url, as expressed by this map. | 45 # List our edges by bag pairs: (from_bag, to_bag) -> graph.Edge. |
| 47 self._url_to_bag = {} | 46 self._edges = {} |
| 48 # Maps graph -> _GraphInfo structures for each graph we've consumed. | 47 # Maps graph -> _GraphInfo structures for each graph we've consumed. |
| 49 self._graph_info = {} | 48 self._graph_info = {} |
| 50 | 49 |
| 51 def ConsumeGraph(self, graph): | 50 # Our graph, updated after each ConsumeGraph. |
| 51 self._graph = None | |
| 52 | |
| 53 def ConsumeGraph(self, request_graph): | |
| 52 """Add a graph and process. | 54 """Add a graph and process. |
| 53 | 55 |
| 54 Args: | 56 Args: |
| 55 graph: (ResourceGraph) the graph to add. The graph is processed sorted | 57 graph: (RequestDependencyGraph) the graph to add. The graph is processed |
| 56 according to its current filter. | 58 sorted according to its current filter. |
|
Benoit L
2016/03/29 13:45:50
nit: There are no filters anymore.
mattcary
2016/03/29 13:56:07
Done.
| |
| 57 """ | 59 """ |
| 58 assert graph not in self._graph_info | 60 assert graph not in self._graph_info |
| 59 critical_path = [] | 61 cost = request_graph.Cost() |
| 60 total_costs = [] | 62 self._graph_info[request_graph] = self._GraphInfo(cost=cost) |
| 61 cost = graph.Cost(path_list=critical_path, | 63 for n in request_graph.graph.Nodes(): |
|
Benoit L
2016/03/29 13:45:50
The nodes are no longer sorted, is it an issue? (b
mattcary
2016/03/29 13:56:07
No. The idea was that because adding nodes transit
| |
| 62 costs_out=total_costs) | 64 self.AddNode(request_graph, n) |
| 63 self._graph_info[graph] = self._GraphInfo( | |
| 64 cost=cost, total_costs=total_costs) | |
| 65 for n in graph.Nodes(sort=True): | |
| 66 assert graph._node_filter(n.Node()) | |
| 67 self.AddNode(graph, n) | |
| 68 for node in critical_path: | |
| 69 self._url_to_bag[node.Url()].MarkCritical() | |
| 70 | 65 |
| 71 def AddNode(self, graph, node): | 66 # TODO(mattcary): this is inefficient but our current API doesn't require an |
| 67 # explicit graph creation from the client. | |
| 68 self._graph = graph.DirectedGraph(self.bags, self._edges.itervalues()) | |
| 69 | |
| 70 def AddNode(self, request_graph, node): | |
| 72 """Add a node to our collection. | 71 """Add a node to our collection. |
| 73 | 72 |
| 74 Args: | 73 Args: |
| 75 graph: (ResourceGraph) the graph in which the node lives. | 74 graph: (RequestDependencyGraph) the graph in which the node lives. |
| 76 node: (NodeInfo) the node to add. | 75 node: (RequestDependencyGraph node) the node to add. |
| 77 | 76 |
| 78 Returns: | 77 Returns: |
| 79 The Bag containing the node. | 78 The Bag containing the node. |
| 80 """ | 79 """ |
| 81 if not graph._node_filter(node): | 80 sack_name = self._GetSackName(node) |
| 82 return | 81 if sack_name not in self._name_to_bag: |
| 83 if node.Url() not in self._url_to_bag: | 82 self._name_to_bag[sack_name] = Bag(self, sack_name) |
| 84 new_index = len(self._bags) | 83 bag = self._name_to_bag[sack_name] |
| 85 self._bags.append(Bag(self, new_index, node.Url())) | 84 bag.AddNode(request_graph, node) |
| 86 self._url_to_bag[node.Url()] = self._bags[-1] | 85 return bag |
| 87 self._url_to_bag[node.Url()].AddNode(graph, node) | 86 |
| 88 return self._url_to_bag[node.Url()] | 87 def AddEdge(self, from_bag, to_bag): |
| 88 """Add an edge between two bags.""" | |
| 89 if (from_bag, to_bag) not in self._edges: | |
| 90 self._edges[(from_bag, to_bag)] = graph.Edge(from_bag, to_bag) | |
| 89 | 91 |
| 90 def CoreSet(self, *graph_sets): | 92 def CoreSet(self, *graph_sets): |
| 91 """Compute the core set of this sack. | 93 """Compute the core set of this sack. |
| 92 | 94 |
| 93 The core set of a sack is the set of resource that are common to most of the | 95 The core set of a sack is the set of resource that are common to most of the |
| 94 graphs in the sack. A core set of a set of graphs are the resources that | 96 graphs in the sack. A core set of a set of graphs are the resources that |
| 95 appear with frequency at least CORE_THRESHOLD. For a collection of graph | 97 appear with frequency at least CORE_THRESHOLD. For a collection of graph |
| 96 sets, for instance pulling the same page under different network | 98 sets, for instance pulling the same page under different network |
| 97 connections, we intersect the core sets to produce a page core set that | 99 connections, we intersect the core sets to produce a page core set that |
| 98 describes the key resources used by the page. See https://goo.gl/LmqQRS for | 100 describes the key resources used by the page. See https://goo.gl/LmqQRS for |
| (...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 134 @property | 136 @property |
| 135 def num_graphs(self): | 137 def num_graphs(self): |
| 136 return len(self.graph_info) | 138 return len(self.graph_info) |
| 137 | 139 |
| 138 @property | 140 @property |
| 139 def graph_info(self): | 141 def graph_info(self): |
| 140 return self._graph_info | 142 return self._graph_info |
| 141 | 143 |
| 142 @property | 144 @property |
| 143 def bags(self): | 145 def bags(self): |
| 144 return self._bags | 146 return self._name_to_bag.values() |
| 145 | 147 |
| 146 def _SingleCore(self, graph_set): | 148 def _SingleCore(self, graph_set): |
| 147 core = set() | 149 core = set() |
| 148 graph_set = set(graph_set) | 150 graph_set = set(graph_set) |
| 149 num_graphs = len(graph_set) | 151 num_graphs = len(graph_set) |
| 150 for b in self.bags: | 152 for b in self.bags: |
| 151 count = sum([g in graph_set for g in b.graphs]) | 153 count = sum([g in graph_set for g in b.graphs]) |
| 152 if float(count) / num_graphs > self.CORE_THRESHOLD: | 154 if float(count) / num_graphs > self.CORE_THRESHOLD: |
| 153 core.add(b.label) | 155 core.add(b.label) |
| 154 return core | 156 return core |
| 155 | 157 |
| 158 def _GetSackName(self, node): | |
| 159 return self._MakeShortname(node.request.url) | |
| 156 | 160 |
| 157 class Bag(dag.Node): | 161 @classmethod |
| 158 def __init__(self, sack, index, url): | 162 def _MakeShortname(cls, url): |
|
Benoit L
2016/03/29 13:45:50
nit: Add a TODO(lizeb) to move this method elsewhe
mattcary
2016/03/29 13:56:07
Done.
Note it's not exactly copied... but I agree
| |
| 159 super(Bag, self).__init__(index) | 163 parsed = urlparse.urlparse(url) |
| 164 if parsed.scheme == 'data': | |
| 165 if ';' in parsed.path: | |
| 166 kind, _ = parsed.path.split(';', 1) | |
| 167 else: | |
| 168 kind, _ = parsed.path.split(',', 1) | |
| 169 return 'data:' + kind | |
| 170 path = parsed.path[:10] | |
| 171 hostname = parsed.hostname if parsed.hostname else '?.?.?' | |
| 172 return hostname + '/' + path | |
| 173 | |
| 174 | |
| 175 class Bag(graph.Node): | |
| 176 def __init__(self, sack, label): | |
| 177 super(Bag, self).__init__() | |
| 160 self._sack = sack | 178 self._sack = sack |
| 161 self._url = url | 179 self._label = label |
| 162 self._label = self._MakeShortname(url) | |
| 163 # Maps a ResourceGraph to its Nodes contained in this Bag. | 180 # Maps a ResourceGraph to its Nodes contained in this Bag. |
| 164 self._graphs = defaultdict(set) | 181 self._graphs = defaultdict(set) |
| 165 # Maps each successor bag to the set of (graph, node, graph-successor) | |
| 166 # tuples that generated it. | |
| 167 self._successor_sources = defaultdict(set) | |
| 168 # Maps each successor bag to a set of edge costs. This is just used to | |
| 169 # track min and max; if we want more statistics we'd have to count the | |
| 170 # costs with multiplicity. | |
| 171 self._successor_edge_costs = defaultdict(set) | |
| 172 | |
| 173 # Miscellaneous counts and costs used in display. | |
| 174 self._total_costs = [] | |
| 175 self._relative_costs = [] | |
| 176 self._num_critical = 0 | |
| 177 | |
| 178 @property | |
| 179 def url(self): | |
| 180 return self._url | |
| 181 | 182 |
| 182 @property | 183 @property |
| 183 def label(self): | 184 def label(self): |
| 184 return self._label | 185 return self._label |
| 185 | 186 |
| 186 @property | 187 @property |
| 187 def graphs(self): | 188 def graphs(self): |
| 188 return self._graphs | 189 return self._graphs.iterkeys() |
| 189 | |
| 190 @property | |
| 191 def successor_sources(self): | |
| 192 return self._successor_sources | |
| 193 | |
| 194 @property | |
| 195 def successor_edge_costs(self): | |
| 196 return self._successor_edge_costs | |
| 197 | |
| 198 @property | |
| 199 def total_costs(self): | |
| 200 return self._total_costs | |
| 201 | |
| 202 @property | |
| 203 def relative_costs(self): | |
| 204 return self._relative_costs | |
| 205 | |
| 206 @property | |
| 207 def num_critical(self): | |
| 208 return self._num_critical | |
| 209 | 190 |
| 210 @property | 191 @property |
| 211 def num_nodes(self): | 192 def num_nodes(self): |
| 212 return len(self._total_costs) | 193 return sum(len(g) for g in self._graphs.itervalues()) |
| 213 | 194 |
| 214 def MarkCritical(self): | 195 def AddNode(self, request_graph, node): |
| 215 self._num_critical += 1 | 196 if node in self._graphs[request_graph]: |
| 216 | |
| 217 def AddNode(self, graph, node): | |
| 218 if node in self._graphs[graph]: | |
| 219 return # Already added. | 197 return # Already added. |
| 220 graph_info = self._sack.graph_info[graph] | 198 self._graphs[request_graph].add(node) |
| 221 self._graphs[graph].add(node) | 199 for edge in request_graph.graph.OutEdges(node): |
| 222 node_total_cost = graph_info.total_costs[node.Index()] | 200 out_bag = self._sack.AddNode(request_graph, edge.to_node) |
| 223 self._total_costs.append(node_total_cost) | 201 self._sack.AddEdge(self, out_bag) |
| 224 self._relative_costs.append( | |
| 225 float(node_total_cost) / graph_info.cost) | |
| 226 for s in node.Node().Successors(): | |
| 227 if not graph._node_filter(s): | |
| 228 continue | |
| 229 node_info = graph.NodeInfo(s) | |
| 230 successor_bag = self._sack.AddNode(graph, node_info) | |
| 231 self.AddSuccessor(successor_bag) | |
| 232 self._successor_sources[successor_bag].add((graph, node, s)) | |
| 233 self._successor_edge_costs[successor_bag].add(graph.EdgeCost(node, s)) | |
| 234 | |
| 235 @classmethod | |
| 236 def _MakeShortname(cls, url): | |
| 237 parsed = urlparse.urlparse(url) | |
| 238 if parsed.scheme == 'data': | |
| 239 if ';' in parsed.path: | |
| 240 kind, _ = parsed.path.split(';', 1) | |
| 241 else: | |
| 242 kind, _ = parsed.path.split(',', 1) | |
| 243 return 'data:' + kind | |
| 244 path = parsed.path[:10] | |
| 245 hostname = parsed.hostname if parsed.hostname else '?.?.?' | |
| 246 return hostname + '/' + path | |
| OLD | NEW |