Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(72)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 63273002: Rename WebKit namespace to blink (part 4) (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 7 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6 6
7 #include "base/bind.h" 7 #include "base/bind.h"
8 #include "base/compiler_specific.h" 8 #include "base/compiler_specific.h"
9 #include "base/containers/hash_tables.h" 9 #include "base/containers/hash_tables.h"
10 #include "base/logging.h" 10 #include "base/logging.h"
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
85 start_time(start_time_ticks), 85 start_time(start_time_ticks),
86 num_iterations(0) {} 86 num_iterations(0) {}
87 87
88 ~PageFeatureState() {} 88 ~PageFeatureState() {}
89 }; 89 };
90 90
91 // Per-frame state 91 // Per-frame state
92 struct PhishingDOMFeatureExtractor::FrameData { 92 struct PhishingDOMFeatureExtractor::FrameData {
93 // This is our reference to document.all, which is an iterator over all 93 // This is our reference to document.all, which is an iterator over all
94 // of the elements in the document. It keeps track of our current position. 94 // of the elements in the document. It keeps track of our current position.
95 WebKit::WebNodeCollection elements; 95 blink::WebNodeCollection elements;
96 // The domain of the document URL, stored here so that we don't need to 96 // The domain of the document URL, stored here so that we don't need to
97 // recompute it every time it's needed. 97 // recompute it every time it's needed.
98 std::string domain; 98 std::string domain;
99 }; 99 };
100 100
101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor(
102 content::RenderView* render_view, 102 content::RenderView* render_view,
103 FeatureExtractorClock* clock) 103 FeatureExtractorClock* clock)
104 : render_view_(render_view), 104 : render_view_(render_view),
105 clock_(clock), 105 clock_(clock),
(...skipping 14 matching lines...) Expand all
120 // starting a new extraction, so DCHECK this. 120 // starting a new extraction, so DCHECK this.
121 CheckNoPendingExtraction(); 121 CheckNoPendingExtraction();
122 // However, in an opt build, we will go ahead and clean up the pending 122 // However, in an opt build, we will go ahead and clean up the pending
123 // extraction so that we can start in a known state. 123 // extraction so that we can start in a known state.
124 CancelPendingExtraction(); 124 CancelPendingExtraction();
125 125
126 features_ = features; 126 features_ = features;
127 done_callback_ = done_callback; 127 done_callback_ = done_callback;
128 128
129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); 129 page_feature_state_.reset(new PageFeatureState(clock_->Now()));
130 WebKit::WebView* web_view = render_view_->GetWebView(); 130 blink::WebView* web_view = render_view_->GetWebView();
131 if (web_view && web_view->mainFrame()) { 131 if (web_view && web_view->mainFrame()) {
132 cur_document_ = web_view->mainFrame()->document(); 132 cur_document_ = web_view->mainFrame()->document();
133 } 133 }
134 134
135 base::MessageLoop::current()->PostTask( 135 base::MessageLoop::current()->PostTask(
136 FROM_HERE, 136 FROM_HERE,
137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout, 137 base::Bind(&PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout,
138 weak_factory_.GetWeakPtr())); 138 weak_factory_.GetWeakPtr()));
139 } 139 }
140 140
(...skipping 10 matching lines...) Expand all
151 151
152 if (cur_document_.isNull()) { 152 if (cur_document_.isNull()) {
153 // This will only happen if we weren't able to get the document for the 153 // This will only happen if we weren't able to get the document for the
154 // main frame. We'll treat this as an extraction failure. 154 // main frame. We'll treat this as an extraction failure.
155 RunCallback(false); 155 RunCallback(false);
156 return; 156 return;
157 } 157 }
158 158
159 int num_elements = 0; 159 int num_elements = 0;
160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
161 WebKit::WebNode cur_node; 161 blink::WebNode cur_node;
162 if (cur_frame_data_.get()) { 162 if (cur_frame_data_.get()) {
163 // We're resuming traversal of a frame, so just advance to the next node. 163 // We're resuming traversal of a frame, so just advance to the next node.
164 cur_node = cur_frame_data_->elements.nextItem(); 164 cur_node = cur_frame_data_->elements.nextItem();
165 // When we resume the traversal, the first call to nextItem() potentially 165 // When we resume the traversal, the first call to nextItem() potentially
166 // has to walk through the document again from the beginning, if it was 166 // has to walk through the document again from the beginning, if it was
167 // modified between our chunks of work. Log how long this takes, so we 167 // modified between our chunks of work. Log how long this takes, so we
168 // can tell if it's too slow. 168 // can tell if it's too slow.
169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", 169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
170 clock_->Now() - current_chunk_start_time); 170 clock_->Now() - current_chunk_start_time);
171 } else { 171 } else {
172 // We just moved to a new frame, so update our frame state 172 // We just moved to a new frame, so update our frame state
173 // and advance to the first element. 173 // and advance to the first element.
174 ResetFrameData(); 174 ResetFrameData();
175 cur_node = cur_frame_data_->elements.firstItem(); 175 cur_node = cur_frame_data_->elements.firstItem();
176 } 176 }
177 177
178 for (; !cur_node.isNull(); 178 for (; !cur_node.isNull();
179 cur_node = cur_frame_data_->elements.nextItem()) { 179 cur_node = cur_frame_data_->elements.nextItem()) {
180 if (!cur_node.isElementNode()) { 180 if (!cur_node.isElementNode()) {
181 continue; 181 continue;
182 } 182 }
183 WebKit::WebElement element = cur_node.to<WebKit::WebElement>(); 183 blink::WebElement element = cur_node.to<blink::WebElement>();
184 if (element.hasTagName("a")) { 184 if (element.hasTagName("a")) {
185 HandleLink(element); 185 HandleLink(element);
186 } else if (element.hasTagName("form")) { 186 } else if (element.hasTagName("form")) {
187 HandleForm(element); 187 HandleForm(element);
188 } else if (element.hasTagName("img")) { 188 } else if (element.hasTagName("img")) {
189 HandleImage(element); 189 HandleImage(element);
190 } else if (element.hasTagName("input")) { 190 } else if (element.hasTagName("input")) {
191 HandleInput(element); 191 HandleInput(element);
192 } else if (element.hasTagName("script")) { 192 } else if (element.hasTagName("script")) {
193 HandleScript(element); 193 HandleScript(element);
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
229 // We're done with this frame, recalculate the FrameData when we 229 // We're done with this frame, recalculate the FrameData when we
230 // advance to the next frame. 230 // advance to the next frame.
231 cur_frame_data_.reset(); 231 cur_frame_data_.reset();
232 } 232 }
233 233
234 InsertFeatures(); 234 InsertFeatures();
235 RunCallback(true); 235 RunCallback(true);
236 } 236 }
237 237
238 void PhishingDOMFeatureExtractor::HandleLink( 238 void PhishingDOMFeatureExtractor::HandleLink(
239 const WebKit::WebElement& element) { 239 const blink::WebElement& element) {
240 // Count the number of times we link to a different host. 240 // Count the number of times we link to a different host.
241 if (!element.hasAttribute("href")) { 241 if (!element.hasAttribute("href")) {
242 DVLOG(1) << "Skipping anchor tag with no href"; 242 DVLOG(1) << "Skipping anchor tag with no href";
243 return; 243 return;
244 } 244 }
245 245
246 // Retrieve the link and resolve the link in case it's relative. 246 // Retrieve the link and resolve the link in case it's relative.
247 WebKit::WebURL full_url = element.document().completeURL( 247 blink::WebURL full_url = element.document().completeURL(
248 element.getAttribute("href")); 248 element.getAttribute("href"));
249 249
250 std::string domain; 250 std::string domain;
251 bool is_external = IsExternalDomain(full_url, &domain); 251 bool is_external = IsExternalDomain(full_url, &domain);
252 if (domain.empty()) { 252 if (domain.empty()) {
253 DVLOG(1) << "Could not extract domain from link: " << full_url; 253 DVLOG(1) << "Could not extract domain from link: " << full_url;
254 return; 254 return;
255 } 255 }
256 256
257 if (is_external) { 257 if (is_external) {
258 ++page_feature_state_->external_links; 258 ++page_feature_state_->external_links;
259 259
260 // Record each unique domain that we link to. 260 // Record each unique domain that we link to.
261 page_feature_state_->external_domains.insert(domain); 261 page_feature_state_->external_domains.insert(domain);
262 } 262 }
263 263
264 // Check how many are https links. 264 // Check how many are https links.
265 if (GURL(full_url).SchemeIs("https")) { 265 if (GURL(full_url).SchemeIs("https")) {
266 ++page_feature_state_->secure_links; 266 ++page_feature_state_->secure_links;
267 } 267 }
268 268
269 ++page_feature_state_->total_links; 269 ++page_feature_state_->total_links;
270 } 270 }
271 271
272 void PhishingDOMFeatureExtractor::HandleForm( 272 void PhishingDOMFeatureExtractor::HandleForm(
273 const WebKit::WebElement& element) { 273 const blink::WebElement& element) {
274 // Increment the number of forms on this page. 274 // Increment the number of forms on this page.
275 ++page_feature_state_->num_forms; 275 ++page_feature_state_->num_forms;
276 276
277 // Record whether the action points to a different domain. 277 // Record whether the action points to a different domain.
278 if (!element.hasAttribute("action")) { 278 if (!element.hasAttribute("action")) {
279 return; 279 return;
280 } 280 }
281 281
282 WebKit::WebURL full_url = element.document().completeURL( 282 blink::WebURL full_url = element.document().completeURL(
283 element.getAttribute("action")); 283 element.getAttribute("action"));
284 284
285 std::string domain; 285 std::string domain;
286 bool is_external = IsExternalDomain(full_url, &domain); 286 bool is_external = IsExternalDomain(full_url, &domain);
287 if (domain.empty()) { 287 if (domain.empty()) {
288 DVLOG(1) << "Could not extract domain from form action: " << full_url; 288 DVLOG(1) << "Could not extract domain from form action: " << full_url;
289 return; 289 return;
290 } 290 }
291 291
292 if (is_external) { 292 if (is_external) {
293 ++page_feature_state_->action_other_domain; 293 ++page_feature_state_->action_other_domain;
294 } 294 }
295 ++page_feature_state_->total_actions; 295 ++page_feature_state_->total_actions;
296 } 296 }
297 297
298 void PhishingDOMFeatureExtractor::HandleImage( 298 void PhishingDOMFeatureExtractor::HandleImage(
299 const WebKit::WebElement& element) { 299 const blink::WebElement& element) {
300 if (!element.hasAttribute("src")) { 300 if (!element.hasAttribute("src")) {
301 DVLOG(1) << "Skipping img tag with no src"; 301 DVLOG(1) << "Skipping img tag with no src";
302 } 302 }
303 303
304 // Record whether the image points to a different domain. 304 // Record whether the image points to a different domain.
305 WebKit::WebURL full_url = element.document().completeURL( 305 blink::WebURL full_url = element.document().completeURL(
306 element.getAttribute("src")); 306 element.getAttribute("src"));
307 std::string domain; 307 std::string domain;
308 bool is_external = IsExternalDomain(full_url, &domain); 308 bool is_external = IsExternalDomain(full_url, &domain);
309 if (domain.empty()) { 309 if (domain.empty()) {
310 DVLOG(1) << "Could not extract domain from image src: " << full_url; 310 DVLOG(1) << "Could not extract domain from image src: " << full_url;
311 return; 311 return;
312 } 312 }
313 313
314 if (is_external) { 314 if (is_external) {
315 ++page_feature_state_->img_other_domain; 315 ++page_feature_state_->img_other_domain;
316 } 316 }
317 ++page_feature_state_->total_imgs; 317 ++page_feature_state_->total_imgs;
318 } 318 }
319 319
320 void PhishingDOMFeatureExtractor::HandleInput( 320 void PhishingDOMFeatureExtractor::HandleInput(
321 const WebKit::WebElement& element) { 321 const blink::WebElement& element) {
322 // The HTML spec says that if the type is unspecified, it defaults to text. 322 // The HTML spec says that if the type is unspecified, it defaults to text.
323 // In addition, any unrecognized type will be treated as a text input. 323 // In addition, any unrecognized type will be treated as a text input.
324 // 324 //
325 // Note that we use the attribute value rather than 325 // Note that we use the attribute value rather than
326 // WebFormControlElement::formControlType() for consistency with the 326 // WebFormControlElement::formControlType() for consistency with the
327 // way the phishing classification model is created. 327 // way the phishing classification model is created.
328 std::string type = element.getAttribute("type").utf8(); 328 std::string type = element.getAttribute("type").utf8();
329 StringToLowerASCII(&type); 329 StringToLowerASCII(&type);
330 if (type == "password") { 330 if (type == "password") {
331 ++page_feature_state_->num_pswd_inputs; 331 ++page_feature_state_->num_pswd_inputs;
332 } else if (type == "radio") { 332 } else if (type == "radio") {
333 ++page_feature_state_->num_radio_inputs; 333 ++page_feature_state_->num_radio_inputs;
334 } else if (type == "checkbox") { 334 } else if (type == "checkbox") {
335 ++page_feature_state_->num_check_inputs; 335 ++page_feature_state_->num_check_inputs;
336 } else if (type != "submit" && type != "reset" && type != "file" && 336 } else if (type != "submit" && type != "reset" && type != "file" &&
337 type != "hidden" && type != "image" && type != "button") { 337 type != "hidden" && type != "image" && type != "button") {
338 // Note that there are a number of new input types in HTML5 that are not 338 // Note that there are a number of new input types in HTML5 that are not
339 // handled above. For now, we will consider these as text inputs since 339 // handled above. For now, we will consider these as text inputs since
340 // they could be used to capture user input. 340 // they could be used to capture user input.
341 ++page_feature_state_->num_text_inputs; 341 ++page_feature_state_->num_text_inputs;
342 } 342 }
343 } 343 }
344 344
345 void PhishingDOMFeatureExtractor::HandleScript( 345 void PhishingDOMFeatureExtractor::HandleScript(
346 const WebKit::WebElement& element) { 346 const blink::WebElement& element) {
347 ++page_feature_state_->num_script_tags; 347 ++page_feature_state_->num_script_tags;
348 } 348 }
349 349
350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { 350 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
351 DCHECK(done_callback_.is_null()); 351 DCHECK(done_callback_.is_null());
352 DCHECK(!cur_frame_data_.get()); 352 DCHECK(!cur_frame_data_.get());
353 DCHECK(cur_document_.isNull()); 353 DCHECK(cur_document_.isNull());
354 if (!done_callback_.is_null() || cur_frame_data_.get() || 354 if (!done_callback_.is_null() || cur_frame_data_.get() ||
355 !cur_document_.isNull()) { 355 !cur_document_.isNull()) {
356 LOG(ERROR) << "Extraction in progress, missing call to " 356 LOG(ERROR) << "Extraction in progress, missing call to "
(...skipping 27 matching lines...) Expand all
384 DCHECK(!cur_frame_data_.get()); 384 DCHECK(!cur_frame_data_.get());
385 385
386 cur_frame_data_.reset(new FrameData()); 386 cur_frame_data_.reset(new FrameData());
387 cur_frame_data_->elements = cur_document_.all(); 387 cur_frame_data_->elements = cur_document_.all();
388 cur_frame_data_->domain = 388 cur_frame_data_->domain =
389 net::registry_controlled_domains::GetDomainAndRegistry( 389 net::registry_controlled_domains::GetDomainAndRegistry(
390 cur_document_.url(), 390 cur_document_.url(),
391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES); 391 net::registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
392 } 392 }
393 393
394 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() { 394 blink::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
395 DCHECK(!cur_document_.isNull()); 395 DCHECK(!cur_document_.isNull());
396 WebKit::WebFrame* frame = cur_document_.frame(); 396 blink::WebFrame* frame = cur_document_.frame();
397 // Advance to the next frame that contains a document, with no wrapping. 397 // Advance to the next frame that contains a document, with no wrapping.
398 if (frame) { 398 if (frame) {
399 while ((frame = frame->traverseNext(false))) { 399 while ((frame = frame->traverseNext(false))) {
400 if (!frame->document().isNull()) { 400 if (!frame->document().isNull()) {
401 return frame->document(); 401 return frame->document();
402 } 402 }
403 } 403 }
404 } else { 404 } else {
405 // Keep track of how often frame traversal got "stuck" due to the 405 // Keep track of how often frame traversal got "stuck" due to the
406 // current subdocument getting removed from the frame tree. 406 // current subdocument getting removed from the frame tree.
407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1); 407 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
408 } 408 }
409 return WebKit::WebDocument(); 409 return blink::WebDocument();
410 } 410 }
411 411
412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, 412 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
413 std::string* domain) const { 413 std::string* domain) const {
414 DCHECK(domain); 414 DCHECK(domain);
415 DCHECK(cur_frame_data_.get()); 415 DCHECK(cur_frame_data_.get());
416 416
417 if (cur_frame_data_->domain.empty()) { 417 if (cur_frame_data_->domain.empty()) {
418 return false; 418 return false;
419 } 419 }
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after
491 // Record number of script tags (discretized for numerical stability.) 491 // Record number of script tags (discretized for numerical stability.)
492 if (page_feature_state_->num_script_tags > 1) { 492 if (page_feature_state_->num_script_tags > 1) {
493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
494 if (page_feature_state_->num_script_tags > 6) { 494 if (page_feature_state_->num_script_tags > 6) {
495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
496 } 496 }
497 } 497 }
498 } 498 }
499 499
500 } // namespace safe_browsing 500 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698