| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
| 6 | 6 |
| 7 #include "base/bind.h" | 7 #include "base/bind.h" |
| 8 #include "base/compiler_specific.h" | 8 #include "base/compiler_specific.h" |
| 9 #include "base/containers/hash_tables.h" | 9 #include "base/containers/hash_tables.h" |
| 10 #include "base/logging.h" | 10 #include "base/logging.h" |
| 11 #include "base/message_loop/message_loop.h" | 11 #include "base/message_loop/message_loop.h" |
| 12 #include "base/metrics/histogram.h" | 12 #include "base/metrics/histogram.h" |
| 13 #include "base/strings/string_util.h" | 13 #include "base/strings/string_util.h" |
| 14 #include "base/time/time.h" | 14 #include "base/time/time.h" |
| 15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
| 16 #include "chrome/renderer/safe_browsing/features.h" | 16 #include "chrome/renderer/safe_browsing/features.h" |
| 17 #include "content/public/renderer/render_view.h" | 17 #include "content/public/renderer/render_view.h" |
| 18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" | 18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
| 19 #include "third_party/WebKit/public/platform/WebString.h" | 19 #include "third_party/WebKit/public/platform/WebString.h" |
| 20 #include "third_party/WebKit/public/web/WebElement.h" | 20 #include "third_party/WebKit/public/web/WebElement.h" |
| 21 #include "third_party/WebKit/public/web/WebElementCollection.h" |
| 21 #include "third_party/WebKit/public/web/WebFrame.h" | 22 #include "third_party/WebKit/public/web/WebFrame.h" |
| 22 #include "third_party/WebKit/public/web/WebNodeCollection.h" | |
| 23 #include "third_party/WebKit/public/web/WebView.h" | 23 #include "third_party/WebKit/public/web/WebView.h" |
| 24 | 24 |
| 25 namespace safe_browsing { | 25 namespace safe_browsing { |
| 26 | 26 |
| 27 // This time should be short enough that it doesn't noticeably disrupt the | 27 // This time should be short enough that it doesn't noticeably disrupt the |
| 28 // user's interaction with the page. | 28 // user's interaction with the page. |
| 29 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; | 29 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; |
| 30 | 30 |
| 31 // Experimenting shows that we get a reasonable gain in performance by | 31 // Experimenting shows that we get a reasonable gain in performance by |
| 32 // increasing this up to around 10, but there's not much benefit in | 32 // increasing this up to around 10, but there's not much benefit in |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 85 start_time(start_time_ticks), | 85 start_time(start_time_ticks), |
| 86 num_iterations(0) {} | 86 num_iterations(0) {} |
| 87 | 87 |
| 88 ~PageFeatureState() {} | 88 ~PageFeatureState() {} |
| 89 }; | 89 }; |
| 90 | 90 |
| 91 // Per-frame state; ResetFrameData() is called when we move to a new frame. | 91 // Per-frame state; ResetFrameData() is called when we move to a new frame. |
| 92 struct PhishingDOMFeatureExtractor::FrameData { | 92 struct PhishingDOMFeatureExtractor::FrameData { |
| 93 // Our reference to document.all: an iterator over all of the elements in | 93 // Our reference to document.all: an iterator over all of the elements in |
| 94 // the document. It tracks our position as we call firstItem()/nextItem(). | 94 // the document. It tracks our position as we call firstItem()/nextItem(). |
| 95 blink::WebNodeCollection elements; | 95 blink::WebElementCollection elements; |
| 96 // The domain of the document URL, cached here so that we don't need to | 96 // The domain of the document URL, cached here so that we don't need to |
| 97 // recompute it every time it's needed. | 97 // recompute it every time it's needed. |
| 98 std::string domain; | 98 std::string domain; |
| 99 }; | 99 }; |
| 100 | 100 |
| 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
| 102 content::RenderView* render_view, | 102 content::RenderView* render_view, |
| 103 FeatureExtractorClock* clock) | 103 FeatureExtractorClock* clock) |
| 104 : render_view_(render_view), | 104 : render_view_(render_view), |
| 105 clock_(clock), | 105 clock_(clock), |
| (...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 151 | 151 |
| 152 if (cur_document_.isNull()) { | 152 if (cur_document_.isNull()) { |
| 153 // This will only happen if we weren't able to get the document for the | 153 // This will only happen if we weren't able to get the document for the |
| 154 // main frame. We'll treat this as an extraction failure. | 154 // main frame. We'll treat this as an extraction failure. |
| 155 RunCallback(false); | 155 RunCallback(false); |
| 156 return; | 156 return; |
| 157 } | 157 } |
| 158 | 158 |
| 159 int num_elements = 0; | 159 int num_elements = 0; |
| 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { | 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { |
| 161 blink::WebNode cur_node; | 161 blink::WebElement cur_element; |
| 162 if (cur_frame_data_.get()) { | 162 if (cur_frame_data_.get()) { |
| 163 // We're resuming traversal of a frame, so just advance to the next node. | 163 // We're resuming traversal of a frame, so just advance to the next |
| 164 cur_node = cur_frame_data_->elements.nextItem(); | 164 // element. |
| 165 cur_element = cur_frame_data_->elements.nextItem(); |
| 165 // When we resume the traversal, the first call to nextItem() potentially | 166 // When we resume the traversal, the first call to nextItem() potentially |
| 166 // has to walk through the document again from the beginning, if it was | 167 // has to walk through the document again from the beginning, if it was |
| 167 // modified between our chunks of work. Log how long this takes, so we | 168 // modified between our chunks of work. Log how long this takes, so we |
| 168 // can tell if it's too slow. | 169 // can tell if it's too slow. |
| 169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | 170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
| 170 clock_->Now() - current_chunk_start_time); | 171 clock_->Now() - current_chunk_start_time); |
| 171 } else { | 172 } else { |
| 172 // We just moved to a new frame, so update our frame state | 173 // We just moved to a new frame, so update our frame state |
| 173 // and advance to the first element. | 174 // and advance to the first element. |
| 174 ResetFrameData(); | 175 ResetFrameData(); |
| 175 cur_node = cur_frame_data_->elements.firstItem(); | 176 cur_element = cur_frame_data_->elements.firstItem(); |
| 176 } | 177 } |
| 177 | 178 |
| 178 for (; !cur_node.isNull(); | 179 for (; !cur_element.isNull(); |
| 179 cur_node = cur_frame_data_->elements.nextItem()) { | 180 cur_element = cur_frame_data_->elements.nextItem()) { |
| 180 if (!cur_node.isElementNode()) { | 181 if (cur_element.hasTagName("a")) { |
| 181 continue; | 182 HandleLink(cur_element); |
| 182 } | 183 } else if (cur_element.hasTagName("form")) { |
| 183 blink::WebElement element = cur_node.to<blink::WebElement>(); | 184 HandleForm(cur_element); |
| 184 if (element.hasTagName("a")) { | 185 } else if (cur_element.hasTagName("img")) { |
| 185 HandleLink(element); | 186 HandleImage(cur_element); |
| 186 } else if (element.hasTagName("form")) { | 187 } else if (cur_element.hasTagName("input")) { |
| 187 HandleForm(element); | 188 HandleInput(cur_element); |
| 188 } else if (element.hasTagName("img")) { | 189 } else if (cur_element.hasTagName("script")) { |
| 189 HandleImage(element); | 190 HandleScript(cur_element); |
| 190 } else if (element.hasTagName("input")) { | |
| 191 HandleInput(element); | |
| 192 } else if (element.hasTagName("script")) { | |
| 193 HandleScript(element); | |
| 194 } | 191 } |
| 195 | 192 |
| 196 if (++num_elements >= kClockCheckGranularity) { | 193 if (++num_elements >= kClockCheckGranularity) { |
| 197 num_elements = 0; | 194 num_elements = 0; |
| 198 base::TimeTicks now = clock_->Now(); | 195 base::TimeTicks now = clock_->Now(); |
| 199 if (now - page_feature_state_->start_time >= | 196 if (now - page_feature_state_->start_time >= |
| 200 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 197 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
| 201 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 198 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
| 202 // We expect this to happen infrequently, so record when it does. | 199 // We expect this to happen infrequently, so record when it does. |
| 203 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); | 200 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); |
| (...skipping 287 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 491 // Record number of script tags (discretized for numerical stability.) | 488 // Record number of script tags (discretized for numerical stability.) |
| 492 if (page_feature_state_->num_script_tags > 1) { | 489 if (page_feature_state_->num_script_tags > 1) { |
| 493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 490 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
| 494 if (page_feature_state_->num_script_tags > 6) { | 491 if (page_feature_state_->num_script_tags > 6) { |
| 495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 492 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
| 496 } | 493 } |
| 497 } | 494 } |
| 498 } | 495 } |
| 499 | 496 |
| 500 } // namespace safe_browsing | 497 } // namespace safe_browsing |
| OLD | NEW |