OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" | 5 #include "chrome/renderer/safe_browsing/phishing_dom_feature_extractor.h" |
6 | 6 |
7 #include "base/bind.h" | 7 #include "base/bind.h" |
8 #include "base/compiler_specific.h" | 8 #include "base/compiler_specific.h" |
9 #include "base/containers/hash_tables.h" | 9 #include "base/containers/hash_tables.h" |
10 #include "base/logging.h" | 10 #include "base/logging.h" |
11 #include "base/message_loop/message_loop.h" | 11 #include "base/message_loop/message_loop.h" |
12 #include "base/metrics/histogram.h" | 12 #include "base/metrics/histogram.h" |
13 #include "base/strings/string_util.h" | 13 #include "base/strings/string_util.h" |
14 #include "base/time/time.h" | 14 #include "base/time/time.h" |
15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" | 15 #include "chrome/renderer/safe_browsing/feature_extractor_clock.h" |
16 #include "chrome/renderer/safe_browsing/features.h" | 16 #include "chrome/renderer/safe_browsing/features.h" |
17 #include "content/public/renderer/render_view.h" | 17 #include "content/public/renderer/render_view.h" |
18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" | 18 #include "net/base/registry_controlled_domains/registry_controlled_domain.h" |
19 #include "third_party/WebKit/public/platform/WebString.h" | 19 #include "third_party/WebKit/public/platform/WebString.h" |
20 #include "third_party/WebKit/public/web/WebElement.h" | 20 #include "third_party/WebKit/public/web/WebElement.h" |
| 21 #include "third_party/WebKit/public/web/WebElementCollection.h" |
21 #include "third_party/WebKit/public/web/WebFrame.h" | 22 #include "third_party/WebKit/public/web/WebFrame.h" |
22 #include "third_party/WebKit/public/web/WebNodeCollection.h" | |
23 #include "third_party/WebKit/public/web/WebView.h" | 23 #include "third_party/WebKit/public/web/WebView.h" |
24 | 24 |
25 namespace safe_browsing { | 25 namespace safe_browsing { |
26 | 26 |
27 // This time should be short enough that it doesn't noticeably disrupt the | 27 // This time should be short enough that it doesn't noticeably disrupt the |
28 // user's interaction with the page. | 28 // user's interaction with the page. |
29 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; | 29 const int PhishingDOMFeatureExtractor::kMaxTimePerChunkMs = 10; |
30 | 30 |
31 // Experimenting shows that we get a reasonable gain in performance by | 31 // Experimenting shows that we get a reasonable gain in performance by |
32 // increasing this up to around 10, but there's not much benefit in | 32 // increasing this up to around 10, but there's not much benefit in |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
85 start_time(start_time_ticks), | 85 start_time(start_time_ticks), |
86 num_iterations(0) {} | 86 num_iterations(0) {} |
87 | 87 |
88 ~PageFeatureState() {} | 88 ~PageFeatureState() {} |
89 }; | 89 }; |
90 | 90 |
91 // Per-frame state | 91 // Per-frame state |
92 struct PhishingDOMFeatureExtractor::FrameData { | 92 struct PhishingDOMFeatureExtractor::FrameData { |
93 // This is our reference to document.all, which is an iterator over all | 93 // This is our reference to document.all, which is an iterator over all |
94 // of the elements in the document. It keeps track of our current position. | 94 // of the elements in the document. It keeps track of our current position. |
95 blink::WebNodeCollection elements; | 95 blink::WebElementCollection elements; |
96 // The domain of the document URL, stored here so that we don't need to | 96 // The domain of the document URL, stored here so that we don't need to |
97 // recompute it every time it's needed. | 97 // recompute it every time it's needed. |
98 std::string domain; | 98 std::string domain; |
99 }; | 99 }; |
100 | 100 |
101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( | 101 PhishingDOMFeatureExtractor::PhishingDOMFeatureExtractor( |
102 content::RenderView* render_view, | 102 content::RenderView* render_view, |
103 FeatureExtractorClock* clock) | 103 FeatureExtractorClock* clock) |
104 : render_view_(render_view), | 104 : render_view_(render_view), |
105 clock_(clock), | 105 clock_(clock), |
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
151 | 151 |
152 if (cur_document_.isNull()) { | 152 if (cur_document_.isNull()) { |
153 // This will only happen if we weren't able to get the document for the | 153 // This will only happen if we weren't able to get the document for the |
154 // main frame. We'll treat this as an extraction failure. | 154 // main frame. We'll treat this as an extraction failure. |
155 RunCallback(false); | 155 RunCallback(false); |
156 return; | 156 return; |
157 } | 157 } |
158 | 158 |
159 int num_elements = 0; | 159 int num_elements = 0; |
160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { | 160 for (; !cur_document_.isNull(); cur_document_ = GetNextDocument()) { |
161 blink::WebNode cur_node; | 161 blink::WebElement cur_element; |
162 if (cur_frame_data_.get()) { | 162 if (cur_frame_data_.get()) { |
163 // We're resuming traversal of a frame, so just advance to the next node. | 163 // We're resuming traversal of a frame, so just advance to the next |
164 cur_node = cur_frame_data_->elements.nextItem(); | 164 // element. |
| 165 cur_element = cur_frame_data_->elements.nextItem(); |
165 // When we resume the traversal, the first call to nextItem() potentially | 166 // When we resume the traversal, the first call to nextItem() potentially |
166 // has to walk through the document again from the beginning, if it was | 167 // has to walk through the document again from the beginning, if it was |
167 // modified between our chunks of work. Log how long this takes, so we | 168 // modified between our chunks of work. Log how long this takes, so we |
168 // can tell if it's too slow. | 169 // can tell if it's too slow. |
169 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", | 170 UMA_HISTOGRAM_TIMES("SBClientPhishing.DOMFeatureResumeTime", |
170 clock_->Now() - current_chunk_start_time); | 171 clock_->Now() - current_chunk_start_time); |
171 } else { | 172 } else { |
172 // We just moved to a new frame, so update our frame state | 173 // We just moved to a new frame, so update our frame state |
173 // and advance to the first element. | 174 // and advance to the first element. |
174 ResetFrameData(); | 175 ResetFrameData(); |
175 cur_node = cur_frame_data_->elements.firstItem(); | 176 cur_element = cur_frame_data_->elements.firstItem(); |
176 } | 177 } |
177 | 178 |
178 for (; !cur_node.isNull(); | 179 for (; !cur_element.isNull(); |
179 cur_node = cur_frame_data_->elements.nextItem()) { | 180 cur_element = cur_frame_data_->elements.nextItem()) { |
180 if (!cur_node.isElementNode()) { | 181 if (cur_element.hasTagName("a")) { |
181 continue; | 182 HandleLink(cur_element); |
182 } | 183 } else if (cur_element.hasTagName("form")) { |
183 blink::WebElement element = cur_node.to<blink::WebElement>(); | 184 HandleForm(cur_element); |
184 if (element.hasTagName("a")) { | 185 } else if (cur_element.hasTagName("img")) { |
185 HandleLink(element); | 186 HandleImage(cur_element); |
186 } else if (element.hasTagName("form")) { | 187 } else if (cur_element.hasTagName("input")) { |
187 HandleForm(element); | 188 HandleInput(cur_element); |
188 } else if (element.hasTagName("img")) { | 189 } else if (cur_element.hasTagName("script")) { |
189 HandleImage(element); | 190 HandleScript(cur_element); |
190 } else if (element.hasTagName("input")) { | |
191 HandleInput(element); | |
192 } else if (element.hasTagName("script")) { | |
193 HandleScript(element); | |
194 } | 191 } |
195 | 192 |
196 if (++num_elements >= kClockCheckGranularity) { | 193 if (++num_elements >= kClockCheckGranularity) { |
197 num_elements = 0; | 194 num_elements = 0; |
198 base::TimeTicks now = clock_->Now(); | 195 base::TimeTicks now = clock_->Now(); |
199 if (now - page_feature_state_->start_time >= | 196 if (now - page_feature_state_->start_time >= |
200 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { | 197 base::TimeDelta::FromMilliseconds(kMaxTotalTimeMs)) { |
201 DLOG(ERROR) << "Feature extraction took too long, giving up"; | 198 DLOG(ERROR) << "Feature extraction took too long, giving up"; |
202 // We expect this to happen infrequently, so record when it does. | 199 // We expect this to happen infrequently, so record when it does. |
203 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); | 200 UMA_HISTOGRAM_COUNTS("SBClientPhishing.DOMFeatureTimeout", 1); |
(...skipping 287 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
491 // Record number of script tags (discretized for numerical stability.) | 488 // Record number of script tags (discretized for numerical stability.) |
492 if (page_feature_state_->num_script_tags > 1) { | 489 if (page_feature_state_->num_script_tags > 1) { |
493 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); | 490 features_->AddBooleanFeature(features::kPageNumScriptTagsGTOne); |
494 if (page_feature_state_->num_script_tags > 6) { | 491 if (page_feature_state_->num_script_tags > 6) { |
495 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); | 492 features_->AddBooleanFeature(features::kPageNumScriptTagsGTSix); |
496 } | 493 } |
497 } | 494 } |
498 } | 495 } |
499 | 496 |
500 } // namespace safe_browsing | 497 } // namespace safe_browsing |
OLD | NEW |