Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2)

Unified Diff: tools/android/loading/content_classification_lens.py

Issue 1626393002: tools/android/loading: ContentClassificationLens, ads and tracking requests. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: . Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: tools/android/loading/content_classification_lens.py
diff --git a/tools/android/loading/content_classification_lens.py b/tools/android/loading/content_classification_lens.py
new file mode 100644
index 0000000000000000000000000000000000000000..a603f007a45942a4a17a7de78d4e2f37112f708e
--- /dev/null
+++ b/tools/android/loading/content_classification_lens.py
@@ -0,0 +1,118 @@
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""Labels requests according to the type of content they represent."""
mattcary 2016/01/25 16:08:58 Include instructions about where to get adblockpar
Benoit L 2016/01/26 13:31:13 Done.
+
+import adblockparser
+import collections
+import os
+
+import loading_trace
+import request_track
+
+
+class ContentClassificationLens(object):
+ """Associates requests and frames with the type of content they represent."""
+ def __init__(self, trace, ad_rules, tracking_rules):
+ """Initializes an instance of ContentClassificationLens.
+
+ Args:
+ trace: (LoadingTrace) loading trace.
+ ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads.
+ tracking_rules: ([str]) List of Adblock+ compatible rules used to
+ classify tracking and analytics.
+ """
+ self._trace = trace
+ self._requests = trace.request_track.GetEvents()
+ self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id']
+ self._frame_to_requests = collections.defaultdict(list)
+ self._ad_requests = set()
+ self._tracking_requests = set()
+ self._ad_matcher = _RulesMatcher(ad_rules, True)
+ self._tracking_matcher = _RulesMatcher(tracking_rules, True)
+ self._GroupRequestsByFrameId()
+ self._LabelRequests()
+
+ def IsAdRequest(self, request):
+ """Returns True iff the request matches one of the ad_rules."""
+ return request.request_id in self._ad_requests
+
+ def IsTrackingRequest(self, request):
+ """Returns True iff the request matches one of the tracking_rules."""
+ return request.request_id in self._tracking_requests
+
+ def IsAdFrame(self, frame_id):
+ """A Frame is an Ad frame if more than 50% of its requests are ad-related,
+ ans it's not the main frame."""
mattcary 2016/01/25 16:08:58 ans -> and
Benoit L 2016/01/26 13:31:13 Done.
+ if frame_id == self._main_frame_id:
+ return False
+ ad_requests_count = sum(r in self._ad_requests
+ for r in self._frame_to_requests[frame_id])
+ frame_requests_count = len(self._frame_to_requests[frame_id])
+ return (float(ad_requests_count) / frame_requests_count) > .5
mattcary 2016/01/25 16:08:58 Is this 50% threshold necessary? I would think tha
Benoit L 2016/01/26 13:31:13 Changed to a configurable threshold. In practice,
+
+ @classmethod
+ def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename):
+ """Returns an instance of ContentClassificationLens with the rules read
+ from files.
+ """
+ ad_rules = []
+ tracking_rules = []
+ if os.path.exists(ad_rules_filename):
+ ad_rules = open(ad_rules_filename, 'r').readlines()
+ if os.path.exists(tracking_rules_filename):
+ tracking_rules = open(tracking_rules_filename, 'r').readlines()
+ return ContentClassificationLens(trace, ad_rules, tracking_rules)
+
+ def _GroupRequestsByFrameId(self):
+ for request in self._requests:
+ frame_id = request.frame_id
+ self._frame_to_requests[frame_id].append(request.request_id)
+
+ def _LabelRequests(self):
+ for request in self._requests:
+ request_id = request.request_id
+ if self._ad_matcher.Matches(request):
+ self._ad_requests.add(request_id)
+ if self._tracking_matcher.Matches(request):
+ self._tracking_requests.add(request_id)
+
+
+class _RulesMatcher(object):
+ """Matches requests with rules in Adblock+ format."""
+ _WHITELIST_PREFIX = '@@'
+ _RESOURCE_TYPE_TO_OPTIONS_KEY = {
+ 'Script': 'script', 'Stylesheet': 'stylesheet', 'Image': 'image',
+ 'XHR': 'xmlhttprequest'}
+ def __init__(self, rules, no_whitelist):
+ """Initializes an instance of _RulesMatcher.
+
+ Args:
+ rules: ([str]) list of rules.
+ no_whitelist: (bool) Whether the whitelisting rules should be ignored.
+ """
+ self._rules = self._FilterRules(rules, no_whitelist)
+ self._matcher = adblockparser.AdblockRules(self._rules)
+
+ def Matches(self, request):
+ """Returns whether a request matches one of the rules."""
+ url = request.url
+ return self._matcher.should_block(url, self._GetOptions(request))
+
+ @classmethod
+ def _GetOptions(cls, request):
+ options = {}
+ resource_type = request.resource_type
+ option = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(resource_type)
+ if option:
+ options[option] = True
+ return options
+
+ @classmethod
+ def _FilterRules(cls, rules, no_whitelist):
+ if not no_whitelist:
+ return rules
+ else:
+ return [rule for rule in rules
+ if not rule.startswith(cls._WHITELIST_PREFIX)]

Powered by Google App Engine
This is Rietveld 408576698