OLD | NEW |
---|---|
1 # Copyright 2015 The Chromium Authors. All rights reserved. | 1 # Copyright 2015 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """Models for loading in chrome. | 5 """Models for loading in chrome. |
6 | 6 |
7 (Redirect the following to the general model module once we have one) | 7 (Redirect the following to the general model module once we have one) |
8 A model is an object with the following methods. | 8 A model is an object with the following methods. |
9 CostMs(): return the cost of the cost in milliseconds. | 9 CostMs(): return the cost of the load in milliseconds. |
blundell
2016/01/21 09:22:53
the cost of the cost?
mattcary
2016/01/21 10:04:22
https://memegen.googleplex.com/5342822322929664
| |
10 Set(): set model-specifical parameters. | 10 Set(): set model-specific parameters. |
blundell
2016/01/21 09:22:53
nit: specific
mattcary
2016/01/21 10:04:22
https://memegen.googleplex.com/5322689026195456
| |
11 | 11 |
12 ResourceGraph | 12 ResourceGraph |
13 This creates a DAG of resource dependancies from loading.log_requests to model | 13 This creates a DAG of resource dependencies from loading.log_requests to model |
blundell
2016/01/21 09:22:53
nit: dependencies
mattcary
2016/01/21 10:04:22
Done.
| |
14 loading time. The model may be parameterized by changing the loading time of | 14 loading time. The model may be parameterized by changing the loading time of |
15 a particular or all resources. | 15 a particular or all resources. |
16 """ | 16 """ |
17 | 17 |
18 import logging | 18 import logging |
19 import os | 19 import os |
20 import urlparse | 20 import urlparse |
21 import sys | 21 import sys |
22 | 22 |
23 import dag | 23 import dag |
24 import log_parser | |
25 | 24 |
26 class ResourceGraph(object): | 25 class ResourceGraph(object): |
27 """A model of loading by a DAG (tree?) of resource dependancies. | 26 """A model of loading by a DAG (tree?) of resource dependancies. |
28 | 27 |
29 Set parameters: | 28 Set parameters: |
30 cache_all: if true, assume zero loading time for all resources. | 29 cache_all: if true, assume zero loading time for all resources. |
31 """ | 30 """ |
31 REQUEST_LENS = None | |
Benoit L
2016/01/21 09:09:26
nit: Can you add a comment saying that this a a cl
mattcary
2016/01/21 10:04:22
Oops, I meant to remove this after you landed your
| |
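The thread above is truncated; from its use in _BuildDag below, REQUEST_LENS is a class-level hook that callers point at a lens class whose instances wrap a trace and yield request dependencies. A stub showing the assumed shape (the stub name and the wiring are illustrative, not this CL's API):

class StubDependencyLens(object):
  """Illustrative stand-in for whatever lens class callers install."""

  def __init__(self, trace):
    self._trace = trace

  def GetRequestDependencies(self):
    # _BuildDag below iterates (child_request, parent_request, reason) triples.
    return []

# Hypothetical wiring, e.g. in a test:
#   ResourceGraph.REQUEST_LENS = StubDependencyLens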
32 | 32 |
33 def __init__(self, requests): | 33 def __init__(self, trace): |
34 """Create from a parsed request set. | 34 """Create from a LoadingTrace. |
35 | 35 |
36 Args: | 36 Args: |
37 requests: [RequestData, ...] filtered RequestData from loading.log_parser. | 37 trace: (LoadingTrace) Loading trace. |
38 """ | 38 """ |
39 self._BuildDag(requests) | 39 self._BuildDag(trace) |
40 self._global_start = min([n.StartTime() for n in self._node_info]) | 40 self._global_start = min([n.StartTime() for n in self._node_info]) |
41 # Sort before splitting children so that we can correctly detect if a | 41 # Sort before splitting children so that we can correctly detect if a |
42 # reparented child is actually a dependency for a child of its new parent. | 42 # reparented child is actually a dependency for a child of its new parent. |
43 try: | 43 try: |
44 for n in dag.TopologicalSort(self._nodes): | 44 for n in dag.TopologicalSort(self._nodes): |
45 self._SplitChildrenByTime(self._node_info[n.Index()]) | 45 self._SplitChildrenByTime(self._node_info[n.Index()]) |
46 except AssertionError as exc: | 46 except AssertionError as exc: |
47 sys.stderr.write('Bad topological sort: %s\n' | 47 sys.stderr.write('Bad topological sort: %s\n' |
48 'Skipping child split\n' % str(exc)) | 48 'Skipping child split\n' % str(exc)) |
49 self._cache_all = False | 49 self._cache_all = False |
(...skipping 125 matching lines...)
175 costs[n.Index()] = cost | 175 costs[n.Index()] = cost |
176 max_cost = max(costs) | 176 max_cost = max(costs) |
177 assert max_cost > 0 # Otherwise probably the filter went awry. | 177 assert max_cost > 0 # Otherwise probably the filter went awry. |
178 if path_list is not None: | 178 if path_list is not None: |
179 del path_list[:-1] | 179 del path_list[:-1] |
180 n = (i for i in self._nodes if costs[i.Index()] == max_cost).next() | 180 n = (i for i in self._nodes if costs[i.Index()] == max_cost).next() |
181 path_list.append(self._node_info[n.Index()]) | 181 path_list.append(self._node_info[n.Index()]) |
182 while n.Predecessors(): | 182 while n.Predecessors(): |
183 n = reduce(lambda costliest, next: | 183 n = reduce(lambda costliest, next: |
184 next if (self._node_filter(next) and | 184 next if (self._node_filter(next) and |
185 cost[next.Index()] > cost[costliest.Index()]) | 185 costs[next.Index()] > costs[costliest.Index()]) |
blundell
2016/01/21 09:22:53
hmm, was this a bug?
mattcary
2016/01/21 10:04:22
Yes, I'm not sure how that got introduced, because
| |
186 else costliest, | 186 else costliest, |
187 n.Predecessors()) | 187 n.Predecessors()) |
188 path_list.insert(0, self._node_info[n.Index()]) | 188 path_list.insert(0, self._node_info[n.Index()]) |
189 return max_cost | 189 return max_cost |
190 | 190 |
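The tail of Cost() shown above walks back from the costliest node to rebuild the critical path. A hedged usage sketch follows; Cost()'s full signature is outside this hunk, so the keyword argument and the surrounding setup are assumptions:

# Assumed usage, given a ResourceGraph built from a LoadingTrace `trace`.
graph = ResourceGraph(trace)
critical_path = []
total_ms = graph.Cost(path_list=critical_path)
print 'Critical path cost: %.1f ms' % total_ms
for node_info in critical_path:
  print '  %-50s %8.1f ms' % (node_info.ShortName(), node_info.NodeCost())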
191 def FilterAds(self, node): | 191 def FilterAds(self, node): |
192 """A filter for use in eg, Cost, to remove advertising nodes. | 192 """A filter for use in eg, Cost, to remove advertising nodes. |
193 | 193 |
194 Args: | 194 Args: |
195 node: A dag.Node. | 195 node: A dag.Node. |
(...skipping 119 matching lines...)
315 """Create a new node info. | 315 """Create a new node info. |
316 | 316 |
317 Args: | 317 Args: |
318 node: The node to augment. | 318 node: The node to augment. |
319 request: The request associated with this node. | 319 request: The request associated with this node. |
320 """ | 320 """ |
321 self._request = request | 321 self._request = request |
322 self._node = node | 322 self._node = node |
323 self._edge_costs = {} | 323 self._edge_costs = {} |
324 self._edge_annotations = {} | 324 self._edge_annotations = {} |
325 # All fields in timing are millis relative to requestTime, which is epoch | 325 # All fields in timing are millis relative to request_time, which is epoch |
326 # seconds. | 326 # seconds. |
327 self._node_cost = max([t for f, t in request.timing._asdict().iteritems() | 327 self._node_cost = max([t for f, t in request.timing._asdict().iteritems() |
328 if f != 'requestTime']) | 328 if f != 'request_time']) |
329 | 329 |
330 def __str__(self): | 330 def __str__(self): |
331 return self.ShortName() | 331 return self.ShortName() |
332 | 332 |
333 def Node(self): | 333 def Node(self): |
334 return self._node | 334 return self._node |
335 | 335 |
336 def Index(self): | 336 def Index(self): |
337 return self._node.Index() | 337 return self._node.Index() |
338 | 338 |
339 def Request(self): | 339 def Request(self): |
340 return self._request | 340 return self._request |
341 | 341 |
342 def NodeCost(self): | 342 def NodeCost(self): |
343 return self._node_cost | 343 return self._node_cost |
344 | 344 |
345 def EdgeCost(self, s): | 345 def EdgeCost(self, s): |
346 return self._edge_costs[s] | 346 return self._edge_costs[s] |
347 | 347 |
348 def StartTime(self): | 348 def StartTime(self): |
349 return self._request.timing.requestTime * 1000 | 349 return self._request.timing.request_time * 1000 |
350 | 350 |
351 def EndTime(self): | 351 def EndTime(self): |
352 return self._request.timing.requestTime * 1000 + self._node_cost | 352 return self._request.timing.request_time * 1000 + self._node_cost |
353 | 353 |
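To make the timing convention above concrete: request_time is in epoch seconds, every other timing field is milliseconds relative to it, and the node cost is the largest of those offsets. A self-contained example (field names other than request_time are made up):

import collections

# Illustrative timing record; the real one comes from the trace.
Timing = collections.namedtuple(
    'Timing', ('request_time', 'dns_end', 'receive_headers_end', 'load_end'))
timing = Timing(request_time=1453370000.25,  # epoch seconds
                dns_end=12.0,                # ms after request_time
                receive_headers_end=95.5,
                load_end=140.0)

node_cost = max(t for f, t in timing._asdict().iteritems()
                if f != 'request_time')      # 140.0 ms
start_time_ms = timing.request_time * 1000   # epoch milliseconds
end_time_ms = start_time_ms + node_cost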
354 def EdgeAnnotation(self, s): | 354 def EdgeAnnotation(self, s): |
355 assert s.Node() in self.Node().Successors() | 355 assert s.Node() in self.Node().Successors() |
356 return self._edge_annotations.get(s, []) | 356 return self._edge_annotations.get(s, []) |
357 | 357 |
358 def ContentType(self): | 358 def ContentType(self): |
359 return log_parser.Resource.FromRequest(self._request).GetContentType() | 359 return self._request.GetContentType() |
360 | 360 |
361 def ShortName(self): | 361 def ShortName(self): |
Benoit L
2016/01/21 09:09:26
Might be better in "Request", but I don't have a s
mattcary
2016/01/21 10:04:22
I wasn't sure what you intended either. I tweaked
Benoit L
2016/01/21 10:38:01
Yes, you're right.
| |
362 return log_parser.Resource.FromRequest(self._request).GetShortName() | 362 """Returns either the hostname of the resource, or the filename, |
363 or the end of the path. Tries to include the domain as much as possible. | |
364 """ | |
365 parsed = urlparse.urlparse(self._request.url) | |
366 path = parsed.path | |
367 if path != '' and path != '/': | |
368 last_path = parsed.path.split('/')[-1] | |
369 if len(last_path) < 10: | |
370 if len(path) < 10: | |
371 return parsed.hostname + '/' + path | |
372 else: | |
373 return parsed.hostname + '/..' + parsed.path[-10:] | |
374 elif len(last_path) > 10: | |
375 return parsed.hostname + '/..' + last_path[:5] | |
376 else: | |
377 return parsed.hostname + '/..' + last_path | |
378 else: | |
379 return parsed.hostname | |
363 | 380 |
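Traced by hand against the branches above (URLs are made up): the new ShortName() turns http://example.com/js/analytics_bundle.min.js into 'example.com/..analy', since the last path component is longer than ten characters and is clipped to its first five, and it turns http://example.com/ into just 'example.com', since there is no path beyond '/'.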
364 def Url(self): | 381 def Url(self): |
365 return self._request.url | 382 return self._request.url |
366 | 383 |
367 def SetEdgeCost(self, child, cost): | 384 def SetEdgeCost(self, child, cost): |
368 assert child.Node() in self._node.Successors() | 385 assert child.Node() in self._node.Successors() |
369 self._edge_costs[child] = cost | 386 self._edge_costs[child] = cost |
370 | 387 |
371 def AddEdgeAnnotation(self, s, annotation): | 388 def AddEdgeAnnotation(self, s, annotation): |
372 assert s.Node() in self._node.Successors() | 389 assert s.Node() in self._node.Successors() |
(...skipping 42 matching lines...)
415 def _EdgeCost(self, parent, child): | 432 def _EdgeCost(self, parent, child): |
416 """Convenience function for redirecting Nodes to _NodeInfo.""" | 433 """Convenience function for redirecting Nodes to _NodeInfo.""" |
417 return self._node_info[parent.Index()].EdgeCost( | 434 return self._node_info[parent.Index()].EdgeCost( |
418 self._node_info[child.Index()]) | 435 self._node_info[child.Index()]) |
419 | 436 |
420 def _EdgeAnnotation(self, parent, child): | 437 def _EdgeAnnotation(self, parent, child): |
421 """Convenience function for redirecting Nodes to _NodeInfo.""" | 438 """Convenience function for redirecting Nodes to _NodeInfo.""" |
422 return self._node_info[parent.Index()].EdgeAnnotation( | 439 return self._node_info[parent.Index()].EdgeAnnotation( |
423 self._node_info[child.Index()]) | 440 self._node_info[child.Index()]) |
424 | 441 |
425 def _BuildDag(self, requests): | 442 def _BuildDag(self, trace): |
426 """Build DAG of resources. | 443 """Build DAG of resources. |
427 | 444 |
428 Build a DAG from our requests and augment with _NodeInfo (see above) in a | 445 Build a DAG from our requests and augment with _NodeInfo (see above) in a |
429 parallel array indexed by Node.Index(). | 446 parallel array indexed by Node.Index(). |
430 | 447 |
431 Creates self._nodes and self._node_info. | 448 Creates self._nodes and self._node_info. |
432 | 449 |
433 Args: | 450 Args: |
434 requests: [Request, ...] Requests from loading.log_parser. | 451 trace: A LoadingTrace. |
435 """ | 452 """ |
436 self._nodes = [] | 453 self._nodes = [] |
437 self._node_info = [] | 454 self._node_info = [] |
438 indicies_by_url = {} | 455 index_by_request = {} |
Benoit L
2016/01/21 09:09:26
Since we now identify everything by the request_id
mattcary
2016/01/21 10:04:22
DAG nodes don't have a request id, so I think we h
Benoit L
2016/01/21 10:38:01
Acknowledged.
| |
439 requests_by_completion = log_parser.SortedByCompletion(requests) | 456 for request in trace.request_track.GetEvents(): |
440 for request in requests: | |
441 next_index = len(self._nodes) | 457 next_index = len(self._nodes) |
442 indicies_by_url.setdefault(request.url, []).append(next_index) | 458 assert request not in index_by_request |
459 index_by_request[request] = next_index | |
443 node = dag.Node(next_index) | 460 node = dag.Node(next_index) |
444 node_info = self._NodeInfo(node, request) | 461 node_info = self._NodeInfo(node, request) |
445 self._nodes.append(node) | 462 self._nodes.append(node) |
446 self._node_info.append(node_info) | 463 self._node_info.append(node_info) |
447 for url, indicies in indicies_by_url.iteritems(): | 464 |
448 if len(indicies) > 1: | 465 dependencies = self.REQUEST_LENS(trace).GetRequestDependencies() |
449 logging.warning('Multiple loads (%d) for url: %s' % | 466 for child_rq, parent_rq, reason in dependencies: |
450 (len(indicies), url)) | 467 parent = self._node_info[index_by_request[parent_rq]] |
451 for i in xrange(len(requests)): | 468 child = self._node_info[index_by_request[child_rq]] |
452 request = requests[i] | 469 edge_cost = child.StartTime() - parent.EndTime() |
453 current_node_info = self._node_info[i] | 470 if edge_cost < 0: |
454 resource = log_parser.Resource.FromRequest(current_node_info.Request()) | 471 edge_cost = 0 |
455 initiator = request.initiator | 472 if child.StartTime() < parent.StartTime(): |
456 initiator_type = initiator['type'] | |
457 predecessor_url = None | |
458 predecessor_type = None | |
459 # Classify & infer the predecessor. If a candidate url we identify as the | |
460 # predecessor is not in index_by_url, then we haven't seen it in our | |
461 # requests and we will try to find a better predecessor. | |
462 if initiator_type == 'parser': | |
463 url = initiator['url'] | |
464 if url in indicies_by_url: | |
465 predecessor_url = url | |
466 predecessor_type = 'parser' | |
467 elif initiator_type == 'script' and 'stackTrace' in initiator: | |
468 for frame in initiator['stackTrace']: | |
469 url = frame['url'] | |
470 if url in indicies_by_url: | |
471 predecessor_url = url | |
472 predecessor_type = 'stack' | |
473 break | |
474 elif initiator_type == 'script': | |
475 # When the initiator is a script without a stackTrace, infer that it | |
476 # comes from the most recent script from the same hostname. TLD+1 might | |
477 # be better, but finding what is a TLD requires a database. | |
478 request_hostname = urlparse.urlparse(request.url).hostname | |
479 sorted_script_requests_from_hostname = [ | |
480 r for r in requests_by_completion | |
481 if (resource.GetContentType() in ('script', 'html', 'json') | |
482 and urlparse.urlparse(r.url).hostname == request_hostname)] | |
483 most_recent = None | |
484 # Linear search is bad, but this shouldn't matter here. | |
485 for r in sorted_script_requests_from_hostname: | |
486 if r.timestamp < request.timing.requestTime: | |
487 most_recent = r | |
488 else: | |
489 break | |
490 if most_recent is not None: | |
491 url = most_recent.url | |
492 if url in indicies_by_url: | |
493 predecessor_url = url | |
494 predecessor_type = 'script_inferred' | |
495 # TODO(mattcary): we skip initiator type other, is that correct? | |
496 if predecessor_url is not None: | |
497 predecessor = self._FindBestPredecessor( | |
498 current_node_info, indicies_by_url[predecessor_url]) | |
499 edge_cost = current_node_info.StartTime() - predecessor.EndTime() | |
500 if edge_cost < 0: | |
501 edge_cost = 0 | |
502 if current_node_info.StartTime() < predecessor.StartTime(): | |
503 logging.error('Inverted dependency: %s->%s', | 473 logging.error('Inverted dependency: %s->%s', |
504 predecessor.ShortName(), current_node_info.ShortName()) | 474 parent.ShortName(), child.ShortName()) |
505 # Note that current.StartTime() < predecessor.EndTime() appears to | 475 # Note that child.StartTime() < parent.EndTime() appears to happen a |
506 # happen a fair amount in practice. | 476 # fair amount in practice. |
Benoit L
2016/01/21 09:09:26
Especially for "parser" dependencies, I suppose?
(
mattcary
2016/01/21 10:04:22
Ah, yeah, that would make sense.
| |
507 predecessor.Node().AddSuccessor(current_node_info.Node()) | 477 parent.Node().AddSuccessor(child.Node()) |
508 predecessor.SetEdgeCost(current_node_info, edge_cost) | 478 parent.SetEdgeCost(child, edge_cost) |
509 predecessor.AddEdgeAnnotation(current_node_info, predecessor_type) | 479 parent.AddEdgeAnnotation(child, reason) |
510 | |
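The rewritten loop above consumes (child, parent, reason) triples from the lens and clamps negative gaps to zero. A self-contained sketch of that edge-cost rule with stand-in objects (all names below are illustrative, not part of the CL):

class FakeNodeInfo(object):
  """Stand-in exposing only the two accessors the edge-cost rule needs."""

  def __init__(self, start_ms, end_ms):
    self._start_ms = start_ms
    self._end_ms = end_ms

  def StartTime(self):
    return self._start_ms

  def EndTime(self):
    return self._end_ms

parent = FakeNodeInfo(start_ms=0.0, end_ms=120.0)
child = FakeNodeInfo(start_ms=100.0, end_ms=180.0)

# Same rule as in _BuildDag: the edge cost is the gap between the parent
# finishing and the child starting, clamped at zero when the child started
# before the parent finished.
edge_cost = max(0, child.StartTime() - parent.EndTime())   # -> 0
inverted = child.StartTime() < parent.StartTime()          # -> False (no log)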
511 def _FindBestPredecessor(self, node_info, candidate_indicies): | |
512 """Find best predecessor for node_info | |
513 | |
514 If there is only one candidate, we use it regardless of timings. We will | |
515 later warn about inverted dependencies. If there are more than one, we use | |
516 the latest whose end time is before node_info's start time. If there is no | |
517 such candidate, we throw up our hands and return an arbitrary one. | |
518 | |
519 Args: | |
520 node_info: _NodeInfo of interest. | |
521 candidate_indicies: indicies of candidate predecessors. | |
522 | |
523 Returns: | |
524 _NodeInfo of best predecessor. | |
525 """ | |
526 assert candidate_indicies | |
527 if len(candidate_indicies) == 1: | |
528 return self._node_info[candidate_indicies[0]] | |
529 candidate = self._node_info[candidate_indicies[0]] | |
530 for i in xrange(1, len(candidate_indicies)): | |
531 next_candidate = self._node_info[candidate_indicies[i]] | |
532 if (next_candidate.EndTime() < node_info.StartTime() and | |
533 next_candidate.StartTime() > candidate.StartTime()): | |
534 candidate = next_candidate | |
535 if candidate.EndTime() > node_info.StartTime(): | |
536 logging.warning('Multiple candidates but all inverted for ' + | |
537 node_info.ShortName()) | |
538 return candidate | |
539 | |
540 | 480 |
541 def _SplitChildrenByTime(self, parent): | 481 def _SplitChildrenByTime(self, parent): |
542 """Split children of a node by request times. | 482 """Split children of a node by request times. |
543 | 483 |
544 The initiator of a request may not be the true dependency of a request. For | 484 The initiator of a request may not be the true dependency of a request. For |
545 example, a script may appear to load several resources independently, but in | 485 example, a script may appear to load several resources independently, but in |
546 fact one of them may be a JSON data file, and the remaining resources are | 486 fact one of them may be a JSON data file, and the remaining resources are |
547 assets described in the JSON. The assets should depend on the JSON data | 487 assets described in the JSON. The assets should depend on the JSON data |
548 file, and not the original script. | 488 file, and not the original script. |
549 | 489 |
(...skipping 67 matching lines...)
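The body of _SplitChildrenByTime is elided in this hunk; its docstring above describes a reparenting heuristic along these lines (a hedged sketch of the idea only, not the CL's actual implementation):

def _SplitChildrenByTimeSketch(children_by_start_time):
  """Illustrative only: reparent children that start after a sibling ends.

  children_by_start_time: stand-in child objects ordered by StartTime(), each
  exposing StartTime() and EndTime(). Returns {child: new_parent} for children
  that look like they really depend on an earlier sibling.
  """
  reparented = {}
  for i, child in enumerate(children_by_start_time):
    for sibling in children_by_start_time[:i]:
      if sibling.EndTime() <= child.StartTime():
        reparented[child] = sibling  # A later qualifying sibling overwrites.
  return reparented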
617 highlight: a list of node items to emphasize. Any resource url which | 557 highlight: a list of node items to emphasize. Any resource url which |
618 contains any highlight text will be distinguished in the output. | 558 contains any highlight text will be distinguished in the output. |
619 | 559 |
620 Returns: | 560 Returns: |
621 A string describing the resource in graphviz format. | 561 A string describing the resource in graphviz format. |
622 The resource is color-coded according to its content type, and its shape | 562 The resource is color-coded according to its content type, and its shape |
623 is oval if its max-age is less than 300s (or if it's not cacheable). | 563 is oval if its max-age is less than 300s (or if it's not cacheable). |
624 """ | 564 """ |
625 node_info = self._node_info[index] | 565 node_info = self._node_info[index] |
626 color = self._CONTENT_TYPE_TO_COLOR[node_info.ContentType()] | 566 color = self._CONTENT_TYPE_TO_COLOR[node_info.ContentType()] |
627 max_age = log_parser.MaxAge(node_info.Request()) | 567 max_age = node_info.Request().MaxAge() |
628 shape = 'polygon' if max_age > 300 else 'oval' | 568 shape = 'polygon' if max_age > 300 else 'oval' |
629 styles = ['filled'] | 569 styles = ['filled'] |
630 if highlight: | 570 if highlight: |
631 for fragment in highlight: | 571 for fragment in highlight: |
632 if fragment in node_info.Url(): | 572 if fragment in node_info.Url(): |
633 styles.append('dotted') | 573 styles.append('dotted') |
634 break | 574 break |
635 return ('%d [label = "%s\\n%.2f->%.2f (%.2f)"; style = "%s"; ' | 575 return ('%d [label = "%s\\n%.2f->%.2f (%.2f)"; style = "%s"; ' |
636 'fillcolor = %s; shape = %s];\n' | 576 'fillcolor = %s; shape = %s];\n' |
637 % (index, node_info.ShortName(), | 577 % (index, node_info.ShortName(), |
(...skipping 88 matching lines...)
726 Dict of image url + short name to NodeInfo. | 666 Dict of image url + short name to NodeInfo. |
727 """ | 667 """ |
728 image_to_info = {} | 668 image_to_info = {} |
729 for n in self._node_info: | 669 for n in self._node_info: |
730 if (n.ContentType().startswith('image') and | 670 if (n.ContentType().startswith('image') and |
731 not self._IsAdUrl(n.Url())): | 671 not self._IsAdUrl(n.Url())): |
732 key = str((n.Url(), n.ShortName(), n.StartTime())) | 672 key = str((n.Url(), n.ShortName(), n.StartTime())) |
733 assert key not in image_to_info, n.Url() | 673 assert key not in image_to_info, n.Url() |
734 image_to_info[key] = n | 674 image_to_info[key] = n |
735 return image_to_info | 675 return image_to_info |