Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1435)

Side by Side Diff: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc

Issue 8055018: Merge 102541 - Change PhishingDOMFeatureExtractor to cache the WebDocument rather than a WebFrame... (Closed) Base URL: svn://svn.chromium.org/chrome/branches/874/src/
Patch Set: Created 9 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h"
6 6
7 #include "base/compiler_specific.h" 7 #include "base/compiler_specific.h"
8 #include "base/hash_tables.h" 8 #include "base/hash_tables.h"
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/message_loop.h" 10 #include "base/message_loop.h"
11 #include "base/metrics/histogram.h" 11 #include "base/metrics/histogram.h"
12 #include "base/string_util.h" 12 #include "base/string_util.h"
13 #include "base/time.h" 13 #include "base/time.h"
14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" 14 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h"
15 #include "chrome/renderer/safe_browsing/features.h" 15 #include "chrome/renderer/safe_browsing/features.h"
16 #include "content/renderer/render_view.h" 16 #include "content/renderer/render_view.h"
17 #include "net/base/registry_controlled_domain.h" 17 #include "net/base/registry_controlled_domain.h"
18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebDocument.h"
19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h" 18 #include "third_party/WebKit/Source/WebKit/chromium/public/WebElement.h"
20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h" 19 #include "third_party/WebKit/Source/WebKit/chromium/public/WebFrame.h"
21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h" 20 #include "third_party/WebKit/Source/WebKit/chromium/public/WebNodeCollection.h"
22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h" 21 #include "third_party/WebKit/Source/WebKit/chromium/public/WebString.h"
23 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h" 22 #include "third_party/WebKit/Source/WebKit/chromium/public/WebView.h"
24 23
25 namespace safe_browsing { 24 namespace safe_browsing {
26 25
27 // This time should be short enough that it doesn't noticeably disrupt the 26 // This time should be short enough that it doesn't noticeably disrupt the
28 // user's interaction with the page. 27 // user's interaction with the page.
(...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after
120 // starting a new extraction, so DCHECK this. 119 // starting a new extraction, so DCHECK this.
121 CheckNoPendingExtraction(); 120 CheckNoPendingExtraction();
122 // However, in an opt build, we will go ahead and clean up the pending 121 // However, in an opt build, we will go ahead and clean up the pending
123 // extraction so that we can start in a known state. 122 // extraction so that we can start in a known state.
124 CancelPendingExtraction(); 123 CancelPendingExtraction();
125 124
126 features_ = features; 125 features_ = features;
127 done_callback_.reset(done_callback); 126 done_callback_.reset(done_callback);
128 127
129 page_feature_state_.reset(new PageFeatureState(clock_->Now())); 128 page_feature_state_.reset(new PageFeatureState(clock_->Now()));
129 WebKit::WebView* web_view = render_view_->webview();
130 if (web_view && web_view->mainFrame()) {
131 cur_document_ = web_view->mainFrame()->document();
132 }
133
130 MessageLoop::current()->PostTask( 134 MessageLoop::current()->PostTask(
131 FROM_HERE, 135 FROM_HERE,
132 method_factory_.NewRunnableMethod( 136 method_factory_.NewRunnableMethod(
133 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); 137 &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout));
134 } 138 }
135 139
136 void PhishingDOMFeatureExtractor::CancelPendingExtraction() { 140 void PhishingDOMFeatureExtractor::CancelPendingExtraction() {
137 // Cancel any pending callbacks, and clear our state. 141 // Cancel any pending callbacks, and clear our state.
138 method_factory_.RevokeAll(); 142 method_factory_.RevokeAll();
139 Clear(); 143 Clear();
140 } 144 }
141 145
142 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { 146 void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() {
143 DCHECK(page_feature_state_.get()); 147 DCHECK(page_feature_state_.get());
144 ++page_feature_state_->num_iterations; 148 ++page_feature_state_->num_iterations;
145 base::TimeTicks current_chunk_start_time = clock_->Now(); 149 base::TimeTicks current_chunk_start_time = clock_->Now();
146 150
147 if (!cur_frame_) { 151 if (cur_document_.isNull()) {
148 WebKit::WebView* web_view = render_view_->webview(); 152 // This will only happen if we weren't able to get the document for the
149 if (!web_view) { 153 // main frame. We'll treat this as an extraction failure.
150 RunCallback(false); // The WebView is going away. 154 RunCallback(false);
151 return; 155 return;
152 }
153 cur_frame_ = web_view->mainFrame();
154 } 156 }
155 157
156 int num_elements = 0; 158 int num_elements = 0;
157 for (; cur_frame_; 159 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) {
158 cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) {
159 WebKit::WebNode cur_node; 160 WebKit::WebNode cur_node;
160 if (cur_frame_data_.get()) { 161 if (cur_frame_data_.get()) {
161 // We're resuming traversal of a frame, so just advance to the next node. 162 // We're resuming traversal of a frame, so just advance to the next node.
162 cur_node = cur_frame_data_->elements.nextItem(); 163 cur_node = cur_frame_data_->elements.nextItem();
163 // When we resume the traversal, the first call to nextItem() potentially 164 // When we resume the traversal, the first call to nextItem() potentially
164 // has to walk through the document again from the beginning, if it was 165 // has to walk through the document again from the beginning, if it was
165 // modified between our chunks of work. Log how long this takes, so we 166 // modified between our chunks of work. Log how long this takes, so we
166 // can tell if it's too slow. 167 // can tell if it's too slow.
167 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", 168 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime",
168 clock_->Now() - current_chunk_start_time); 169 clock_->Now() - current_chunk_start_time);
(...skipping 175 matching lines...) Expand 10 before | Expand all | Expand 10 after
344 } 345 }
345 346
346 void PhishingDOMFeatureExtractor::HandleScript( 347 void PhishingDOMFeatureExtractor::HandleScript(
347 const WebKit::WebElement& element) { 348 const WebKit::WebElement& element) {
348 ++page_feature_state_->num_script_tags; 349 ++page_feature_state_->num_script_tags;
349 } 350 }
350 351
351 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { 352 void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() {
352 DCHECK(!done_callback_.get()); 353 DCHECK(!done_callback_.get());
353 DCHECK(!cur_frame_data_.get()); 354 DCHECK(!cur_frame_data_.get());
354 DCHECK(!cur_frame_); 355 DCHECK(cur_document_.isNull());
355 if (done_callback_.get() || cur_frame_data_.get() || cur_frame_) { 356 if (done_callback_.get() || cur_frame_data_.get() ||
357 !cur_document_.isNull()) {
356 LOG(ERROR) << "Extraction in progress, missing call to " 358 LOG(ERROR) << "Extraction in progress, missing call to "
357 << "CancelPendingExtraction"; 359 << "CancelPendingExtraction";
358 } 360 }
359 } 361 }
360 362
361 void PhishingDOMFeatureExtractor::RunCallback(bool success) { 363 void PhishingDOMFeatureExtractor::RunCallback(bool success) {
362 // Record some timing stats that we can use to evaluate feature extraction 364 // Record some timing stats that we can use to evaluate feature extraction
363 // performance. These include both successful and failed extractions. 365 // performance. These include both successful and failed extractions.
364 DCHECK(page_feature_state_.get()); 366 DCHECK(page_feature_state_.get());
365 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", 367 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations",
366 page_feature_state_->num_iterations); 368 page_feature_state_->num_iterations);
367 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", 369 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime",
368 clock_->Now() - page_feature_state_->start_time); 370 clock_->Now() - page_feature_state_->start_time);
369 371
370 DCHECK(done_callback_.get()); 372 DCHECK(done_callback_.get());
371 done_callback_->Run(success); 373 done_callback_->Run(success);
372 Clear(); 374 Clear();
373 } 375 }
374 376
375 void PhishingDOMFeatureExtractor::Clear() { 377 void PhishingDOMFeatureExtractor::Clear() {
376 features_ = NULL; 378 features_ = NULL;
377 done_callback_.reset(NULL); 379 done_callback_.reset(NULL);
378 cur_frame_data_.reset(NULL); 380 cur_frame_data_.reset(NULL);
379 cur_frame_ = NULL; 381 cur_document_.reset();
380 } 382 }
381 383
382 bool PhishingDOMFeatureExtractor::ResetFrameData() { 384 bool PhishingDOMFeatureExtractor::ResetFrameData() {
383 DCHECK(cur_frame_); 385 DCHECK(!cur_document_.isNull());
384 DCHECK(!cur_frame_data_.get()); 386 DCHECK(!cur_frame_data_.get());
385 387
386 WebKit::WebDocument doc = cur_frame_->document();
387 if (doc.isNull()) {
388 return false;
389 }
390 cur_frame_data_.reset(new FrameData()); 388 cur_frame_data_.reset(new FrameData());
391 cur_frame_data_->elements = doc.all(); 389 cur_frame_data_->elements = cur_document_.all();
392 cur_frame_data_->domain = 390 cur_frame_data_->domain =
393 net::RegistryControlledDomainService::GetDomainAndRegistry( 391 net::RegistryControlledDomainService::GetDomainAndRegistry(
394 cur_frame_->document().url()); 392 cur_document_.url());
395 return true; 393 return true;
396 } 394 }
397 395
396 WebKit::WebDocument PhishingDOMFeatureExtractor::GetNextDocument() {
397 DCHECK(!cur_document_.isNull());
398 WebKit::WebFrame* frame = cur_document_.frame();
399 // Advance to the next frame that contains a document, with no wrapping.
400 if (frame) {
401 while ((frame = frame->traverseNext(false))) {
402 if (!frame->document().isNull()) {
403 return frame->document();
404 }
405 }
406 } else {
407 // Keep track of how often frame traversal got "stuck" due to the
408 // current subdocument getting removed from the frame tree.
409 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureFrameRemoved", 1);
410 }
411 return WebKit::WebDocument();
412 }
413
398 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url, 414 bool PhishingDOMFeatureExtractor::IsExternalDomain(const GURL& url,
399 std::string* domain) const { 415 std::string* domain) const {
400 DCHECK(domain); 416 DCHECK(domain);
401 DCHECK(cur_frame_data_.get()); 417 DCHECK(cur_frame_data_.get());
402 418
403 if (cur_frame_data_->domain.empty()) { 419 if (cur_frame_data_->domain.empty()) {
404 return false; 420 return false;
405 } 421 }
406 422
407 // TODO(bryner): Ensure that the url encoding is consistent with the features 423 // TODO(bryner): Ensure that the url encoding is consistent with the features
(...skipping 69 matching lines...) Expand 10 before | Expand all | Expand 10 after
477 // Record number of script tags (discretized for numerical stability.) 493 // Record number of script tags (discretized for numerical stability.)
478 if (page_feature_state_->num_script_tags > 1) { 494 if (page_feature_state_->num_script_tags > 1) {
479 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne);
480 if (page_feature_state_->num_script_tags > 6) { 496 if (page_feature_state_->num_script_tags > 6) {
481 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); 497 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix);
482 } 498 }
483 } 499 }
484 } 500 }
485 501
486 } // namespace safe_browsing 502 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698