Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 # Use of this source code is governed by a BSD-style license that can be | |
| 3 # found in the LICENSE file. | |
| 4 | |
| 5 """Labels requests according to the type of content they represent.""" | |
|
mattcary
2016/01/25 16:08:58
Include instructions about where to get adblockpar
Benoit L
2016/01/26 13:31:13
Done.
| |
| 6 | |
| 7 import adblockparser | |
| 8 import collections | |
| 9 import os | |
| 10 | |
| 11 import loading_trace | |
| 12 import request_track | |
| 13 | |
| 14 | |
class ContentClassificationLens(object):
  """Associates requests and frames with the type of content they represent.

  Every request of the trace is matched against the ad rules and the tracking
  rules once, at construction time. The query methods only consult the
  resulting sets of request ids.
  """

  def __init__(self, trace, ad_rules, tracking_rules):
    """Initializes an instance of ContentClassificationLens.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads.
      tracking_rules: ([str]) List of Adblock+ compatible rules used to
                      classify tracking and analytics.
    """
    self._trace = trace
    self._requests = trace.request_track.GetEvents()
    # The frame of the first page event is taken to be the main frame.
    self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id']
    # frame_id -> [request_id]
    self._frame_to_requests = collections.defaultdict(list)
    # Ids of the requests matching the ad / tracking rules.
    self._ad_requests = set()
    self._tracking_requests = set()
    # Whitelisting rules are ignored for both rule sets (second argument).
    self._ad_matcher = _RulesMatcher(ad_rules, True)
    self._tracking_matcher = _RulesMatcher(tracking_rules, True)
    self._GroupRequestsByFrameId()
    self._LabelRequests()

  def IsAdRequest(self, request):
    """Returns True iff the request matches one of the ad_rules."""
    return request.request_id in self._ad_requests

  def IsTrackingRequest(self, request):
    """Returns True iff the request matches one of the tracking_rules."""
    return request.request_id in self._tracking_requests

  def IsAdFrame(self, frame_id, threshold=.5):
    """Returns whether a frame is an Ad frame.

    A frame is an Ad frame if it's not the main frame, and strictly more than
    |threshold| of its requests are ad-related.

    Args:
      frame_id: Id of the frame to classify.
      threshold: (float) Fraction of the frame's requests that must be
                 exceeded by ad requests. Defaults to .5 (50%).

    Returns:
      True for an Ad frame. False for the main frame, for a frame without any
      recorded request, and for a frame at or below the threshold.
    """
    if frame_id == self._main_frame_id:
      return False
    # .get() rather than indexing: indexing the defaultdict would insert an
    # empty entry for an unknown frame.
    request_ids = self._frame_to_requests.get(frame_id)
    if not request_ids:
      # No requests recorded for this frame: nothing to base a ratio on, and
      # dividing by the request count would raise ZeroDivisionError.
      return False
    ad_requests_count = sum(
        1 for request_id in request_ids if request_id in self._ad_requests)
    # float() keeps this a true division under Python 2 as well.
    return float(ad_requests_count) / len(request_ids) > threshold

  @classmethod
  def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename):
    """Returns an instance of ContentClassificationLens with the rules read
    from files.

    A rules file that does not exist is treated as an empty list of rules.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules_filename: (str) Path to the file holding the ad rules, one rule
                         per line.
      tracking_rules_filename: (str) Path to the file holding the tracking
                               rules, one rule per line.
    """
    ad_rules = cls._ReadRulesFile(ad_rules_filename)
    tracking_rules = cls._ReadRulesFile(tracking_rules_filename)
    # cls, not the hard-coded class name, so that subclasses get an instance
    # of their own type.
    return cls(trace, ad_rules, tracking_rules)

  @staticmethod
  def _ReadRulesFile(filename):
    """Returns the lines of a rules file, or [] if the file doesn't exist."""
    if not os.path.exists(filename):
      return []
    # The context manager closes the file even if reading fails.
    with open(filename, 'r') as rules_file:
      return rules_file.readlines()

  def _GroupRequestsByFrameId(self):
    """Fills the frame_id -> [request_id] mapping from the trace requests."""
    for request in self._requests:
      self._frame_to_requests[request.frame_id].append(request.request_id)

  def _LabelRequests(self):
    """Records the ids of the requests matching the ad and tracking rules.

    A request can be labeled as both ad and tracking.
    """
    for request in self._requests:
      request_id = request.request_id
      if self._ad_matcher.Matches(request):
        self._ad_requests.add(request_id)
      if self._tracking_matcher.Matches(request):
        self._tracking_requests.add(request_id)
| 80 | |
| 81 | |
class _RulesMatcher(object):
  """Matches requests with rules in Adblock+ format."""
  # Prefix marking a whitelisting (exception) rule.
  _WHITELIST_PREFIX = '@@'
  # Request resource type -> name of the matching rule option. Resource types
  # missing from this mapping are matched without any option set.
  _RESOURCE_TYPE_TO_OPTIONS_KEY = {
      'Script': 'script', 'Stylesheet': 'stylesheet', 'Image': 'image',
      'XHR': 'xmlhttprequest'}

  def __init__(self, rules, no_whitelist):
    """Initializes an instance of _RulesMatcher.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be ignored.
    """
    self._rules = self._FilterRules(rules, no_whitelist)
    self._matcher = adblockparser.AdblockRules(self._rules)

  def Matches(self, request):
    """Returns whether a request matches one of the rules.

    Args:
      request: Request to match. Its url and resource_type are used.
    """
    return self._matcher.should_block(request.url, self._GetOptions(request))

  @classmethod
  def _GetOptions(cls, request):
    """Returns the rule options dict describing the request's resource type.

    The dict holds at most one {option: True} entry, and is empty when the
    resource type has no known option.
    """
    option = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(request.resource_type)
    if option:
      return {option: True}
    return {}

  @classmethod
  def _FilterRules(cls, rules, no_whitelist):
    """Returns the rules to use, without whitelisting ones if requested.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be dropped.

    Returns:
      |rules| itself if no_whitelist is False, otherwise a new list without
      the whitelisting rules. The kept rules are returned unmodified.
    """
    if not no_whitelist:
      return rules
    # Leading whitespace is skipped when looking for the prefix, so that an
    # indented whitelisting rule is dropped as well. NOTE(review): assumes the
    # rule parser ignores surrounding whitespace when classifying a rule --
    # confirm against the adblockparser version in use.
    return [rule for rule in rules
            if not rule.lstrip().startswith(cls._WHITELIST_PREFIX)]
| OLD | NEW |