OLD | NEW |
---|---|
(Empty) | |
1 # Copyright 2016 The Chromium Authors. All rights reserved. | |
2 # Use of this source code is governed by a BSD-style license that can be | |
3 # found in the LICENSE file. | |
4 | |
5 """Labels requests according to the type of content they represent.""" | |
mattcary
2016/01/25 16:08:58
Include instructions about where to get adblockpar
Benoit L
2016/01/26 13:31:13
Done.
| |
6 | |
7 import adblockparser | |
8 import collections | |
9 import os | |
10 | |
11 import loading_trace | |
12 import request_track | |
13 | |
14 | |
class ContentClassificationLens(object):
  """Associates requests and frames with the type of content they represent.

  Every request of the trace is matched once, at construction time, against
  two Adblock+ compatible rule lists: one for ads, one for tracking and
  analytics. The resulting labels are then served by the Is*() methods.
  """

  def __init__(self, trace, ad_rules, tracking_rules):
    """Initializes an instance of ContentClassificationLens.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules: ([str]) List of Adblock+ compatible rules used to classify ads.
      tracking_rules: ([str]) List of Adblock+ compatible rules used to
                      classify tracking and analytics.
    """
    self._trace = trace
    self._requests = trace.request_track.GetEvents()
    # The frame of the first page event is treated as the main frame.
    self._main_frame_id = trace.page_track.GetEvents()[0]['frame_id']
    # frame_id -> [request_id] for every request issued from that frame.
    self._frame_to_requests = collections.defaultdict(list)
    # Ids of the requests matching the ad / tracking rules.
    self._ad_requests = set()
    self._tracking_requests = set()
    # True: whitelisting rules are ignored by both matchers.
    self._ad_matcher = _RulesMatcher(ad_rules, True)
    self._tracking_matcher = _RulesMatcher(tracking_rules, True)
    self._GroupRequestsByFrameId()
    self._LabelRequests()

  def IsAdRequest(self, request):
    """Returns True iff the request matches one of the ad_rules."""
    return request.request_id in self._ad_requests

  def IsTrackingRequest(self, request):
    """Returns True iff the request matches one of the tracking_rules."""
    return request.request_id in self._tracking_requests

  def IsAdFrame(self, frame_id, ratio=.5):
    """Returns whether a frame is an ad frame.

    A frame is an ad frame if strictly more than |ratio| of its requests are
    ad-related, and it's not the main frame. A frame without any recorded
    request is never an ad frame.

    Args:
      frame_id: Id of the frame, as found in the frame_id of the requests.
      ratio: (float) Fraction of ad-related requests above which the frame is
             considered to be an ad frame. Defaults to .5.

    Returns:
      True iff the frame is an ad frame.
    """
    if frame_id == self._main_frame_id:
      return False
    # .get() instead of indexing: looking up an unknown frame must not add an
    # empty entry to the defaultdict.
    frame_requests = self._frame_to_requests.get(frame_id, ())
    if not frame_requests:
      # No request for this frame: avoid dividing by zero below.
      return False
    ad_requests_count = sum(r in self._ad_requests for r in frame_requests)
    return (float(ad_requests_count) / len(frame_requests)) > ratio

  @classmethod
  def WithRulesFiles(cls, trace, ad_rules_filename, tracking_rules_filename):
    """Returns an instance of ContentClassificationLens with the rules read
    from files.

    A file that doesn't exist is treated as an empty list of rules.

    Args:
      trace: (LoadingTrace) loading trace.
      ad_rules_filename: (str) Path to the file containing the ad rules.
      tracking_rules_filename: (str) Path to the file containing the tracking
                               rules.
    """
    ad_rules = cls._ReadRulesFile(ad_rules_filename)
    tracking_rules = cls._ReadRulesFile(tracking_rules_filename)
    return cls(trace, ad_rules, tracking_rules)

  @staticmethod
  def _ReadRulesFile(filename):
    """Returns the lines of a rules file, or [] if the file doesn't exist."""
    if not os.path.exists(filename):
      return []
    # The context manager closes the file, even if reading fails.
    with open(filename, 'r') as rules_file:
      return rules_file.readlines()

  def _GroupRequestsByFrameId(self):
    """Fills the frame_id -> [request_id] mapping from the trace requests."""
    for request in self._requests:
      self._frame_to_requests[request.frame_id].append(request.request_id)

  def _LabelRequests(self):
    """Records the ids of the requests matching the ad and tracking rules.

    The two categories are independent: a request can be in both sets.
    """
    for request in self._requests:
      request_id = request.request_id
      if self._ad_matcher.Matches(request):
        self._ad_requests.add(request_id)
      if self._tracking_matcher.Matches(request):
        self._tracking_requests.add(request_id)
80 | |
81 | |
class _RulesMatcher(object):
  """Matches requests with rules in Adblock+ format."""
  # Prefix marking a whitelisting rule.
  _WHITELIST_PREFIX = '@@'
  # Request resource type -> name of the option passed to the rules matcher.
  # Resource types that are not listed here are matched without any option.
  _RESOURCE_TYPE_TO_OPTIONS_KEY = {
      'Script': 'script', 'Stylesheet': 'stylesheet', 'Image': 'image',
      'XHR': 'xmlhttprequest'}

  def __init__(self, rules, no_whitelist):
    """Initializes an instance of _RulesMatcher.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be ignored.
    """
    self._rules = self._FilterRules(rules, no_whitelist)
    self._matcher = adblockparser.AdblockRules(self._rules)

  def Matches(self, request):
    """Returns whether a request matches one of the rules.

    Args:
      request: Request to match. Its url and resource_type are used.
    """
    return self._matcher.should_block(request.url, self._GetOptions(request))

  @classmethod
  def _GetOptions(cls, request):
    """Returns the matcher options dict for the resource type of a request.

    The dict is empty when the resource type has no associated option.
    """
    option = cls._RESOURCE_TYPE_TO_OPTIONS_KEY.get(request.resource_type)
    if not option:
      return {}
    return {option: True}

  @classmethod
  def _FilterRules(cls, rules, no_whitelist):
    """Returns the rules, without the whitelisting ones if no_whitelist is set.

    Args:
      rules: ([str]) list of rules.
      no_whitelist: (bool) Whether the whitelisting rules should be dropped.

    Returns:
      |rules| itself if no_whitelist is False, a new filtered list otherwise.
    """
    if not no_whitelist:
      return rules
    # Leading whitespace is skipped before testing the prefix, so that an
    # indented whitelisting rule is dropped as well.
    # NOTE(review): adblockparser presumably strips the rule text before
    # parsing it, in which case such a rule would otherwise be honored.
    # Confirm against the library.
    return [rule for rule in rules
            if not rule.lstrip().startswith(cls._WHITELIST_PREFIX)]
OLD | NEW |