Index: tools/android/loading/content_classification_lens.py |
diff --git a/tools/android/loading/content_classification_lens.py b/tools/android/loading/content_classification_lens.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..2e8f0bdceab1a93bd44defbe80ec35023a7a8855 |
--- /dev/null |
+++ b/tools/android/loading/content_classification_lens.py |
@@ -0,0 +1,118 @@ |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+"""Labels requests according to the type of content they represent.""" |
+ |
+import adblockparser # Available on PyPI, through pip. |
+import collections |
+import os |
+ |
+import loading_trace |
+import request_track |
+ |
+ |
+class ContentClassificationLens(object): |
+ """Associates requests and frames with the type of content they represent.""" |
+ def __init__(self, trace, ad_rules, tracking_rules): |
+ """Initializes an instance of ContentClassificationLens. |
+ |
+ Args: |
+ trace: (LoadingTrace) loading trace. |
+ ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads. |
+ tracking_rules: ([str]) List of Adblock+ compatible rules used to |
+ classify tracking and analytics. |
+ """ |
+ self._trace = trace |
+ self._requests = trace.request_track.GetEvents() |
+ self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id'] |
+ self._frame_to_requests = collections.defaultdict(list) |
+ self._ad_requests = set() |
+ self._tracking_requests = set() |
+ self._ad_matcher = _RulesMatcher(ad_rules, True) |
+ self._tracking_matcher = _RulesMatcher(tracking_rules, True) |
+ self._GroupRequestsByFrameId() |
+ self._LabelRequests() |
+ |
+ def IsAdRequest(self, request): |
+ """Returns True iff the request matches one of the ad_rules.""" |
+ return request.request_id in self._ad_requests |
+ |
+ def IsTrackingRequest(self, request): |
+ """Returns True iff the request matches one of the tracking_rules.""" |
+ return request.request_id in self._tracking_requests |
+ |
+ def IsAdFrame(self, frame_id, ratio): |
+ """A Frame is an Ad frame if more than |ratio| of its requests are |
+ ad-related, and is not the main frame.""" |
+ if frame_id == self._main_frame_id: |
+ return False |
+ ad_requests_count = sum(r in self._ad_requests |
+ for r in self._frame_to_requests[frame_id]) |
+ frame_requests_count = len(self._frame_to_requests[frame_id]) |
+ return (float(ad_requests_count) / frame_requests_count) > ratio |
+ |
+ @classmethod |
+ def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename): |
+ """Returns an instance of ContentClassificationLens with the rules read |
+ from files. |
+ """ |
+ ad_rules = [] |
+ tracking_rules = [] |
+ if os.path.exists(ad_rules_filename): |
+ ad_rules = open(ad_rules_filename, 'r').readlines() |
+ if os.path.exists(tracking_rules_filename): |
+ tracking_rules = open(tracking_rules_filename, 'r').readlines() |
+ return ContentClassificationLens(trace, ad_rules, tracking_rules) |
+ |
+ def _GroupRequestsByFrameId(self): |
+ for request in self._requests: |
+ frame_id = request.frame_id |
+ self._frame_to_requests[frame_id].append(request.request_id) |
+ |
+ def _LabelRequests(self): |
+ for request in self._requests: |
+ request_id = request.request_id |
+ if self._ad_matcher.Matches(request): |
+ self._ad_requests.add(request_id) |
+ if self._tracking_matcher.Matches(request): |
+ self._tracking_requests.add(request_id) |
+ |
+ |
+class _RulesMatcher(object): |
+ """Matches requests with rules in Adblock+ format.""" |
+ _WHITELIST_PREFIX = '@@' |
+ _RESOURCE_TYPE_TO_OPTIONS_KEY = { |
+ 'Script': 'script', 'Stylesheet': 'stylesheet', 'Image': 'image', |
+ 'XHR': 'xmlhttprequest'} |
+ def __init__(self, rules, no_whitelist): |
+ """Initializes an instance of _RulesMatcher. |
+ |
+ Args: |
+ rules: ([str]) list of rules. |
+ no_whitelist: (bool) Whether the whitelisting rules should be ignored. |
+ """ |
+ self._rules = self._FilterRules(rules, no_whitelist) |
+ self._matcher = adblockparser.AdblockRules(self._rules) |
+ |
+ def Matches(self, request): |
+ """Returns whether a request matches one of the rules.""" |
+ url = request.url |
+ return self._matcher.should_block(url, self._GetOptions(request)) |
+ |
+ @classmethod |
+ def _GetOptions(cls, request): |
+ options = {} |
+ resource_type = request.resource_type |
+ option = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(resource_type) |
+ if option: |
+ options[option] = True |
+ return options |
+ |
+ @classmethod |
+ def _FilterRules(cls, rules, no_whitelist): |
+ if not no_whitelist: |
+ return rules |
+ else: |
+ return [rule for rule in rules |
+ if not rule.startswith(cls._WHITELIST_PREFIX)] |