Index: chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
diff --git a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
index 7dbf7651825409b288604c11e1f226bf959e7aa9..db17e1d9c5aecbbfcc07eb97807d973ebffff486 100644 |
--- a/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
+++ b/chrome/renderer/safe_browsing/phishing_dom_feature_extractor.cc |
@@ -8,7 +8,10 @@ |
#include "base/hash_tables.h" |
#include "base/histogram.h" |
#include "base/logging.h" |
+#include "base/message_loop.h" |
+#include "base/time.h" |
#include "chrome/renderer/render_view.h" |
+#include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
#include "chrome/renderer/safe_browsing/features.h" |
#include "net/base/registry_controlled_domain.h" |
#include "third_party/WebKit/WebKit/chromium/public/WebDocument.h" |
@@ -20,6 +23,19 @@ |
namespace safe_browsing { |
+// This time should be short enough that it doesn't noticeably disrupt the |
+// user's interaction with the page. |
+const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 50; |
+ |
+// Experimenting shows that we get a reasonable gain in performance by |
+// increasing this up to around 10, but there's not much benefit in |
+// increasing it past that. |
+const int PhishingDOMFeatureExtractor::kClockCheckGranularity = 10; |
+ |
+// This should be longer than we expect feature extraction to take on any |
+// actual phishing page. |
+const int PhishingDOMFeatureExtractor::kMaxTotalTimeMs = 500; |
+ |
// Intermediate state used for computing features. See features.h for |
// descriptions of the DOM features that are computed. |
struct PhishingDOMFeatureExtractor::PageFeatureState { |
@@ -45,7 +61,13 @@ struct PhishingDOMFeatureExtractor::PageFeatureState { |
// How many script tags |
int num_script_tags; |
- PageFeatureState() |
+ // The time at which we started feature extraction for the current page. |
+ base::TimeTicks start_time; |
+ |
+ // The number of iterations we've done for the current extraction. |
+ int num_iterations; |
+ |
+ explicit PageFeatureState(base::TimeTicks start_time_ticks) |
: external_links(0), |
secure_links(0), |
total_links(0), |
@@ -58,7 +80,9 @@ struct PhishingDOMFeatureExtractor::PageFeatureState { |
total_actions(0), |
img_other_domain(0), |
total_imgs(0), |
- num_script_tags(0) {} |
+ num_script_tags(0), |
+ start_time(start_time_ticks), |
+ num_iterations(0) {} |
~PageFeatureState() {} |
}; |
@@ -74,8 +98,10 @@ struct PhishingDOMFeatureExtractor::FrameData { |
}; |
PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
- RenderView* render_view) |
+ RenderView* render_view, |
+ FeatureExtractorClock* clock) |
: render_view_(render_view), |
+ clock_(clock), |
ALLOW_THIS_IN_INITIALIZER_LIST(method_factory_(this)) { |
Clear(); |
} |
@@ -98,6 +124,8 @@ void PhishingDOMFeatureExtractor::ExtractFeatures( |
features_ = features; |
done_callback_.reset(done_callback); |
+ |
+ page_feature_state_.reset(new PageFeatureState(clock_->Now())); |
MessageLoop::current()->PostTask( |
FROM_HERE, |
method_factory_.NewRunnableMethod( |
@@ -111,6 +139,10 @@ void PhishingDOMFeatureExtractor::CancelPendingExtraction() { |
} |
void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
+ DCHECK(page_feature_state_.get()); |
+ ++page_feature_state_->num_iterations; |
+ base::TimeTicks current_chunk_start_time = clock_->Now(); |
+ |
if (!cur_frame_) { |
WebKit::WebView* web_view = render_view_->webview(); |
if (!web_view) { |
@@ -122,15 +154,21 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
return; |
} |
cur_frame_ = web_view->mainFrame(); |
- page_feature_state_.reset(new PageFeatureState); |
} |
+ int num_elements = 0; |
for (; cur_frame_; |
cur_frame_ = cur_frame_->traverseNext(false /* don't wrap around */)) { |
WebKit::WebNode cur_node; |
if (cur_frame_data_.get()) { |
// We're resuming traversal of a frame, so just advance to the next node. |
cur_node = cur_frame_data_->elements.nextItem(); |
+ // When we resume the traversal, the first call to nextItem() potentially |
+ // has to walk through the document again from the beginning, if it was |
+ // modified between our chunks of work. Log how long this takes, so we |
+ // can tell if it's too slow. |
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
+ clock_->Now() - current_chunk_start_time); |
} else { |
// We just moved to a new frame, so update our frame state |
// and advance to the first element. |
@@ -160,8 +198,36 @@ void PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout() { |
HandleScript(element); |
} |
- // TODO(bryner): stop if too much time has elapsed, and add histograms |
- // for the time spent processing. |
+ if (++num_elements >= kClockCheckGranularity) { |
+ num_elements = 0; |
+ base::TimeTicks now = clock_->Now(); |
+ if (now - page_feature_state_->start_time >= |
+ base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
+ DLOG(ERROR) << "Feature extraction took too long, giving up"; |
+ // We expect this to happen infrequently, so record when it does. |
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); |
+ RunCallback(false); |
+ return; |
+ } |
+ base::TimeDelta chunk_elapsed = now - current_chunk_start_time; |
+ if (chunk_elapsed >= |
+ base::TimeDelta::FromMilliseconds(kMaxTimePerChunkMs)) { |
+ // The time limit for the current chunk is up, so post a task to |
+ // continue extraction. |
+ // |
+ // Record how much time we actually spent on the chunk. If this is |
+ // much higher than kMaxTimePerChunkMs, we may need to adjust the |
+ // clock granularity. |
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureChunkTime", |
+ chunk_elapsed); |
+ MessageLoop::current()->PostTask( |
+ FROM_HERE, |
+ method_factory_.NewRunnableMethod( |
+ &PhishingDOMFeatureExtractor::ExtractFeaturesWithTimeout)); |
+ return; |
+ } |
+ // Otherwise, continue. |
+ } |
} |
// We're done with this frame, recalculate the FrameData when we |
@@ -296,6 +362,14 @@ void PhishingDOMFeatureExtractor::CheckNoPendingExtraction() { |
} |
void PhishingDOMFeatureExtractor::RunCallback(bool success) { |
+ // Record some timing stats that we can use to evaluate feature extraction |
+ // performance. These include both successful and failed extractions. |
+ DCHECK(page_feature_state_.get()); |
+ UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureIterations", |
+ page_feature_state_->num_iterations); |
+ UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureTotalTime", |
+ clock_->Now() - page_feature_state_->start_time); |
+ |
DCHECK(done_callback_.get()); |
done_callback_->Run(success); |
Clear(); |