tools/android/loading/content_classification_lens.py - Issue 1626393002: tools/android/loading: ContentClassificationLens, ads and tracking requests.

Unified Diff: tools/android/loading/content_classification_lens.py

Issue 1626393002: tools/android/loading: ContentClassificationLens, ads and tracking requests. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: . Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « tools/android/loading/analyze.py ('k') | tools/android/loading/content_classification_lens_unittest.py » ('j') | tools/android/loading/install-deps.sh » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: tools/android/loading/content_classification_lens.py

diff --git a/tools/android/loading/content_classification_lens.py b/tools/android/loading/content_classification_lens.py

new file mode 100644

index 0000000000000000000000000000000000000000..a603f007a45942a4a17a7de78d4e2f37112f708e

--- /dev/null

+++ b/tools/android/loading/content_classification_lens.py

@@ -0,0 +1,118 @@

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+"""Labels requests according to the type of content they represent."""

mattcary 2016/01/25 16:08:58 Include instructions about where to get adblockpar

Benoit L 2016/01/26 13:31:13 Done.

+import adblockparser

+import collections

+import os

+import loading_trace

+import request_track

class ContentClassificationLens(object):
  """Associates requests and frames with the type of content they represent.

  Requests are labelled as ad and/or tracking requests by matching them
  against two lists of Adblock+ compatible rules. All labels are computed once,
  at construction time.
  """

  def __init__(self, trace, ad_rules, tracking_rules):
    """Initializes an instance of ContentClassificationLens.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads.
      tracking_rules: ([str]) List of Adblock+ compatible rules used to
                      classify tracking and analytics.
    """
    self._trace = trace
    self._requests = trace.request_track.GetEvents()
    # The frame of the first page event is used as the main frame.
    self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id']
    self._frame_to_requests = collections.defaultdict(list)
    self._ad_requests = set()
    self._tracking_requests = set()
    # Both matchers are built with whitelisting rules ignored.
    self._ad_matcher = _RulesMatcher(ad_rules, True)
    self._tracking_matcher = _RulesMatcher(tracking_rules, True)
    self._GroupRequestsByFrameId()
    self._LabelRequests()

  def IsAdRequest(self, request):
    """Returns True iff the request matches one of the ad_rules."""
    return request.request_id in self._ad_requests

  def IsTrackingRequest(self, request):
    """Returns True iff the request matches one of the tracking_rules."""
    return request.request_id in self._tracking_requests

  def IsAdFrame(self, frame_id, ratio=.5):
    """Returns True iff the frame is considered to be an ad frame.

    A frame is an ad frame if strictly more than |ratio| of its requests are
    ad-related, and it's not the main frame. A frame without any recorded
    request is never an ad frame.

    Args:
      frame_id: Identifier of the frame to classify.
      ratio: (float) Fraction of ad requests above which the frame is an ad
             frame. Defaults to 0.5.
    """
    if frame_id == self._main_frame_id:
      return False
    # Use get() rather than indexing: indexing the defaultdict would insert an
    # empty entry for every unknown frame that gets queried.
    request_ids = self._frame_to_requests.get(frame_id)
    if not request_ids:
      # Nothing to compute a ratio from (and it would be a division by zero).
      return False
    ad_requests_count = sum(r in self._ad_requests for r in request_ids)
    return (float(ad_requests_count) / len(request_ids)) > ratio

  @classmethod
  def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename):
    """Returns an instance of ContentClassificationLens with the rules read
    from files.

    A rules file that doesn't exist is treated as an empty list of rules.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules_filename: (str) Path to the file holding the ad rules.
      tracking_rules_filename: (str) Path to the file holding the tracking
                               rules.
    """
    ad_rules = cls._ReadRulesFile(ad_rules_filename)
    tracking_rules = cls._ReadRulesFile(tracking_rules_filename)
    # cls rather than the class name, so that subclasses get an instance of
    # their own type.
    return cls(trace, ad_rules, tracking_rules)

  @staticmethod
  def _ReadRulesFile(filename):
    """Returns the lines of a rules file, or [] if the file doesn't exist."""
    if not os.path.exists(filename):
      return []
    # The context manager closes the file as soon as it has been read.
    with open(filename, 'r') as rules_file:
      return rules_file.readlines()

  def _GroupRequestsByFrameId(self):
    """Records the IDs of the requests issued by each frame."""
    for request in self._requests:
      self._frame_to_requests[request.frame_id].append(request.request_id)

  def _LabelRequests(self):
    """Records which requests match the ad rules and the tracking rules.

    A request can carry both labels.
    """
    for request in self._requests:
      request_id = request.request_id
      if self._ad_matcher.Matches(request):
        self._ad_requests.add(request_id)
      if self._tracking_matcher.Matches(request):
        self._tracking_requests.add(request_id)

class _RulesMatcher(object):
  """Matches requests with rules in Adblock+ format."""

  # Prefix of an exception (whitelisting) rule.
  _WHITELIST_PREFIX = '@@'
  # Maps the resource type of a request to the name of the rule option that is
  # set when matching it. Other resource types are matched with no option.
  _RESOURCE_TYPE_TO_OPTIONS_KEY = {
      'Script': 'script', 'Stylesheet': 'stylesheet', 'Image': 'image',
      'XHR': 'xmlhttprequest'}

  def __init__(self, rules, no_whitelist):
    """Initializes an instance of _RulesMatcher.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be ignored.
    """
    self._rules = self._FilterRules(rules, no_whitelist)
    self._matcher = adblockparser.AdblockRules(self._rules)

  def Matches(self, request):
    """Returns whether a request matches one of the rules."""
    return self._matcher.should_block(request.url, self._GetOptions(request))

  @classmethod
  def _GetOptions(cls, request):
    """Returns the options dict to use when matching a request.

    The dict holds a single {option: True} entry when the resource type of the
    request is a known one, and is empty otherwise.
    """
    option = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(request.resource_type)
    if not option:
      return {}
    return {option: True}

  @classmethod
  def _FilterRules(cls, rules, no_whitelist):
    """Returns the rules to use, without the whitelisting ones if requested.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be dropped.

    Returns:
      |rules| itself if no_whitelist is False, a new filtered list otherwise.
    """
    if not no_whitelist:
      return rules
    # Leading whitespace is skipped before looking for the prefix, so that an
    # indented whitelisting rule (e.g. from a hand-edited rules file) is
    # dropped as well instead of slipping through the filter.
    return [rule for rule in rules
            if not rule.lstrip().startswith(cls._WHITELIST_PREFIX)]

« no previous file with comments | « tools/android/loading/analyze.py ('k') | tools/android/loading/content_classification_lens_unittest.py » ('j') | tools/android/loading/install-deps.sh » ('J')