Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1070)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 2878046: Add an extractor for DOM features to be used for client side phishing detection. (Closed)
Patch Set: address marria's comments Created 10 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6
7 #include "base/compiler_specific.h"
8 #include "base/hash_tables.h"
9 #include "base/histogram.h"
10 #include "base/logging.h"
11 #include "chrome/renderer/render_view.h"
12 #include "chrome/renderer/safe_browsing/features.h"
13 #include "net/base/registry_controlled_domain.h"
14 #include "third_party/WebKit/WebKit/chromium/public/WebDocument.h"
15 #include "third_party/WebKit/WebKit/chromium/public/WebElement.h"
16 #include "third_party/WebKit/WebKit/chromium/public/WebFrame.h"
17 #include "third_party/WebKit/WebKit/chromium/public/WebNodeCollection.h"
18 #include "third_party/WebKit/WebKit/chromium/public/WebString.h"
19 #include "third_party/WebKit/WebKit/chromium/public/WebView.h"
20
21 namespace safe_browsing {
22
23 // Intermediate state used for computing features. See features.h for
24 // descriptions of the DOM features that are computed.
25 struct PhishingDOMFeatureExtractor::PageFeatureState {
26 // Link related features
27 int external_links;
28 base::hash_set<std::string> external_domains;
29 int secure_links;
30 int total_links;
31
32 // Form related features
33 int num_forms;
34 int num_text_inputs;
35 int num_pswd_inputs;
36 int num_radio_inputs;
37 int num_check_inputs;
38 int action_other_domain;
39 int total_actions;
40
41 // Image related features
42 int img_other_domain;
43 int total_imgs;
44
45 // How many script tags
46 int num_script_tags;
47
48 PageFeatureState()
49 : external_links(0),
50 secure_links(0),
51 total_links(0),
52 num_forms(0),
53 num_text_inputs(0),
54 num_pswd_inputs(0),
55 num_radio_inputs(0),
56 num_check_inputs(0),
57 action_other_domain(0),
58 total_actions(0),
59 img_other_domain(0),
60 total_imgs(0),
61 num_script_tags(0) {}
62
63 ~PageFeatureState() {}
64 };
65
66 // Per-frame state
67 struct PhishingDOMFeatureExtractor::FrameData {
68 // This is our reference to document.all, which is an iterator over all
69 // of the elements in the document. It keeps track of our current position.
70 WebKit::WebNodeCollection elements;
71 // The domain of the document URL, stored here so that we don't need to
72 // recompute it every time it's needed.
73 std::string domain;
74 };
75
76 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
77 RenderView* render_view)
78 : render_view_(render_view),
79 ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) {
80 Clear();
81 }
82
83 PhishingDOMFeatureExtractor::~PhishingDOMFeatureExtractor() {
84 // The RenderView should have called CancelPendingExtraction() before
85 // we are destroyed.
86 CheckNoPendingExtraction();
87 }
88
89 void PhishingDOMFeatureExtractor::ExtractFeatures(
90 FeatureMap* features,
91 DoneCallback* done_callback) {
92 // The RenderView should have called CancelPendingExtraction() before
93 // starting a new extraction, so DCHECK this.
94 CheckNoPendingExtraction();
95 // However, in an opt build, we will go ahead and clean up the pending
96 // extraction so that we can start in a known state.
97 CancelPendingExtraction();
98
99 features_ = features;
100 done_callback_.reset(done_callback);
101 MessageLoop::current()->PostTask(
102 FROM_HERE,
103 method_factory_.NewRunnableMethod(
104 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
105 }
106
107 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
108 // Cancel any pending callbacks, and clear our state.
109 method_factory_.RevokeAll();
110 Clear();
111 }
112
113 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
114 if (!cur_frame_) {
115 WebKit::WebView* web_view = render_view_->webview();
116 if (!web_view) {
117 // When the WebView is going away, the render view should have called
118 // CancelPendingExtraction() which should have stopped any pending work,
119 // so this case should not happen.
120 NOTREACHED();
121 RunCallback(false);
122 return;
123 }
124 cur_frame_ = web_view->mainFrame();
125 page_feature_state_.reset(new PageFeatureState);
126 }
127
128 for (; cur_frame_;
129 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
130 WebKit::WebNode cur_node;
131 if (cur_frame_data_.get()) {
132 // We're resuming traversal of a frame, so just advance to the next node.
133 cur_node = cur_frame_data_->elements.nextItem();
134 } else {
135 // We just moved to a new frame, so update our frame state
136 // and advance to the first element.
137 if (!ResetFrameData()) {
138 // Nothing in this frame, move on to the next one.
139 LOG(WARNING) << "No content in frame, skipping";
140 continue;
141 }
142 cur_node = cur_frame_data_->elements.firstItem();
143 }
144
145 for (; !cur_node.isNull();
146 cur_node = cur_frame_data_->elements.nextItem()) {
147 if (!cur_node.isElementNode()) {
148 continue;
149 }
150 WebKit::WebElement element = cur_node.to<WebKit::WebElement>();
151 if (element.hasTagName("a")) {
152 HandleLink(element);
153 } else if (element.hasTagName("form")) {
154 HandleForm(element);
155 } else if (element.hasTagName("img")) {
156 HandleImage(element);
157 } else if (element.hasTagName("input")) {
158 HandleInput(element);
159 } else if (element.hasTagName("script")) {
160 HandleScript(element);
161 }
162
163 // TODO(bryner): stop if too much time has elapsed, and add histograms
164 // for the time spent processing.
165 }
166
167 // We're done with this frame, recalculate the FrameData when we
168 // advance to the next frame.
169 cur_frame_data_.reset();
170 }
171
172 InsertFeatures();
173 RunCallback(true);
174 }
175
176 void PhishingDOMFeatureExtractor::HandleLink(
177 const WebKit::WebElement& element) {
178 // Count the number of times we link to a different host.
179 if (!element.hasAttribute("href")) {
180 DLOG(INFO) << "Skipping anchor tag with no href";
181 return;
182 }
183
184 // Retrieve the link and resolve the link in case it's relative.
185 WebKit::WebURL full_url = element.document().completeURL(
186 element.getAttribute("href"));
187
188 std::string domain;
189 bool is_external = IsExternalDomain(full_url, &domain);
190 if (domain.empty()) {
191 LOG(ERROR) << "Could not extract domain from link: " << full_url;
192 return;
193 }
194
195 if (is_external) {
196 ++page_feature_state_->external_links;
197
198 // Record each unique domain that we link to.
199 page_feature_state_->external_domains.insert(domain);
200 }
201
202 // Check how many are https links.
203 if (GURL(full_url).SchemeIs("https")) {
204 ++page_feature_state_->secure_links;
205 }
206
207 ++page_feature_state_->total_links;
208 }
209
210 void PhishingDOMFeatureExtractor::HandleForm(
211 const WebKit::WebElement& element) {
212 // Increment the number of forms on this page.
213 ++page_feature_state_->num_forms;
214
215 // Record whether the action points to a different domain.
216 if (!element.hasAttribute("action")) {
217 return;
218 }
219
220 WebKit::WebURL full_url = element.document().completeURL(
221 element.getAttribute("action"));
222
223 std::string domain;
224 bool is_external = IsExternalDomain(full_url, &domain);
225 if (domain.empty()) {
226 LOG(ERROR) << "Could not extract domain from form action: " << full_url;
227 return;
228 }
229
230 if (is_external) {
231 ++page_feature_state_->action_other_domain;
232 }
233 ++page_feature_state_->total_actions;
234 }
235
236 void PhishingDOMFeatureExtractor::HandleImage(
237 const WebKit::WebElement& element) {
238 if (!element.hasAttribute("src")) {
239 DLOG(INFO) << "Skipping img tag with no src";
240 }
241
242 // Record whether the image points to a different domain.
243 WebKit::WebURL full_url = element.document().completeURL(
244 element.getAttribute("src"));
245 std::string domain;
246 bool is_external = IsExternalDomain(full_url, &domain);
247 if (domain.empty()) {
248 LOG(ERROR) << "Could not extract domain from image src: " << full_url;
249 return;
250 }
251
252 if (is_external) {
253 ++page_feature_state_->img_other_domain;
254 }
255 ++page_feature_state_->total_imgs;
256 }
257
258 void PhishingDOMFeatureExtractor::HandleInput(
259 const WebKit::WebElement& element) {
260 // The HTML spec says that if the type is unspecified, it defaults to text.
261 // In addition, any unrecognized type will be treated as a text input.
262 //
263 // Note that we use the attribute value rather than
264 // WebFormControlElement::formControlType() for consistency with the
265 // way the phishing classification model is created.
266 std::string type = element.getAttribute("type").utf8();
267 StringToLowerASCII(&type);
268 if (type == "password") {
269 ++page_feature_state_->num_pswd_inputs;
270 } else if (type == "radio") {
271 ++page_feature_state_->num_radio_inputs;
272 } else if (type == "checkbox") {
273 ++page_feature_state_->num_check_inputs;
274 } else if (type != "submit" && type != "reset" && type != "file" &&
275 type != "hidden" && type != "image" && type != "button") {
276 // Note that there are a number of new input types in HTML5 that are not
277 // handled above. For now, we will consider these as text inputs since
278 // they could be used to capture user input.
279 ++page_feature_state_->num_text_inputs;
280 }
281 }
282
283 void PhishingDOMFeatureExtractor::HandleScript(
284 const WebKit::WebElement& element) {
285 ++page_feature_state_->num_script_tags;
286 }
287
288 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
289 DCHECK(!done_callback_.get());
290 DCHECK(!cur_frame_data_.get());
291 DCHECK(!cur_frame_);
292 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) {
293 LOG(ERROR) << "Extraction in progress, missing call to "
294 << "CancelPendingExtraction";
295 }
296 }
297
298 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
299 DCHECK(done_callback_.get());
300 done_callback_->Run(success);
301 Clear();
302 }
303
304 void PhishingDOMFeatureExtractor::Clear() {
305 features_ = NULL;
306 done_callback_.reset(NULL);
307 cur_frame_data_.reset(NULL);
308 cur_frame_ = NULL;
309 }
310
311 bool PhishingDOMFeatureExtractor::ResetFrameData() {
312 DCHECK(cur_frame_);
313 DCHECK(!cur_frame_data_.get());
314
315 WebKit::WebDocument doc = cur_frame_->document();
316 if (doc.isNull()) {
317 return false;
318 }
319 cur_frame_data_.reset(new FrameData());
320 cur_frame_data_->elements = doc.all();
321 cur_frame_data_->domain =
322 net::RegistryControlledDomainService::GetDomainAndRegistry(
323 cur_frame_->url());
324 return true;
325 }
326
327 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
328 std::string* domain) const {
329 DCHECK(domain);
330 DCHECK(cur_frame_data_.get());
331
332 if (cur_frame_data_->domain.empty()) {
333 return false;
334 }
335
336 // TODO(bryner): Ensure that the url encoding is consistent with the features
337 // in the model.
338 if (url.HostIsIPAddress()) {
339 domain->assign(url.host());
340 } else {
341 domain->assign(net::RegistryControlledDomainService::GetDomainAndRegistry(
342 url));
343 }
344
345 return !domain->empty() && *domain != cur_frame_data_->domain;
346 }
347
348 void PhishingDOMFeatureExtractor::InsertFeatures() {
349 DCHECK(page_feature_state_.get());
350 features_->Clear();
351
352 if (page_feature_state_->total_links > 0) {
353 // Add a feature for the fraction of times the page links to an external
354 // domain vs. an internal domain.
355 double link_freq = static_cast<double>(
356 page_feature_state_->external_links) /
357 page_feature_state_->total_links;
358 features_->AddRealFeature(features::kPageExternalLinksFreq, link_freq);
359
360 // Add a feature for each unique domain that we're linking to
361 for (base::hash_set<std::string>::iterator it =
362 page_feature_state_->external_domains.begin();
363 it != page_feature_state_->external_domains.end(); ++it) {
364 features_->AddBooleanFeature(features::kPageLinkDomain + *it);
365 }
366
367 // Fraction of links that use https.
368 double secure_freq = static_cast<double>(
369 page_feature_state_->secure_links) / page_feature_state_->total_links;
370 features_->AddRealFeature(features::kPageSecureLinksFreq, secure_freq);
371 }
372
373 // Record whether forms appear and whether various form elements appear.
374 if (page_feature_state_->num_forms > 0) {
375 features_->AddBooleanFeature(features::kPageHasForms);
376 }
377 if (page_feature_state_->num_text_inputs > 0) {
378 features_->AddBooleanFeature(features::kPageHasTextInputs);
379 }
380 if (page_feature_state_->num_pswd_inputs > 0) {
381 features_->AddBooleanFeature(features::kPageHasPswdInputs);
382 }
383 if (page_feature_state_->num_radio_inputs > 0) {
384 features_->AddBooleanFeature(features::kPageHasRadioInputs);
385 }
386 if (page_feature_state_->num_check_inputs > 0) {
387 features_->AddBooleanFeature(features::kPageHasCheckInputs);
388 }
389
390 // Record fraction of form actions that point to a different domain.
391 if (page_feature_state_->total_actions > 0) {
392 double action_freq = static_cast<double>(
393 page_feature_state_->action_other_domain) /
394 page_feature_state_->total_actions;
395 features_->AddRealFeature(features::kPageActionOtherDomainFreq,
396 action_freq);
397 }
398
399 // Record how many image src attributes point to a different domain.
400 if (page_feature_state_->total_imgs > 0) {
401 double img_freq = static_cast<double>(
402 page_feature_state_->img_other_domain) /
403 page_feature_state_->total_imgs;
404 features_->AddRealFeature(features::kPageImgOtherDomainFreq, img_freq);
405 }
406
407 // Record number of script tags (discretized for numerical stability.)
408 if (page_feature_state_->num_script_tags > 1) {
409 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
410 if (page_feature_state_->num_script_tags > 6) {
411 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
412 }
413 }
414 }
415
416 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698