Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(239)

Side by Side Diff: chrome/renderer/safe_browsing/threat_dom_details.cc

Issue 2713233002: Update ThreatDOMDetails to be able to collect non-resource HTML Elements based on their attributes. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/threat_dom_details.h" 5 #include "chrome/renderer/safe_browsing/threat_dom_details.h"
6 6
7 #include <map> 7 #include <map>
8 #include <unordered_set>
8 9
9 #include "base/compiler_specific.h" 10 #include "base/compiler_specific.h"
11 #include "base/metrics/field_trial_params.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_split.h"
10 #include "base/strings/stringprintf.h" 14 #include "base/strings/stringprintf.h"
11 #include "components/safe_browsing/common/safebrowsing_messages.h" 15 #include "components/safe_browsing/common/safebrowsing_messages.h"
12 #include "content/public/renderer/render_frame.h" 16 #include "content/public/renderer/render_frame.h"
13 #include "third_party/WebKit/public/platform/WebString.h" 17 #include "third_party/WebKit/public/platform/WebString.h"
14 #include "third_party/WebKit/public/web/WebDocument.h" 18 #include "third_party/WebKit/public/web/WebDocument.h"
15 #include "third_party/WebKit/public/web/WebElement.h" 19 #include "third_party/WebKit/public/web/WebElement.h"
16 #include "third_party/WebKit/public/web/WebElementCollection.h" 20 #include "third_party/WebKit/public/web/WebElementCollection.h"
17 #include "third_party/WebKit/public/web/WebFrame.h" 21 #include "third_party/WebKit/public/web/WebFrame.h"
18 #include "third_party/WebKit/public/web/WebLocalFrame.h" 22 #include "third_party/WebKit/public/web/WebLocalFrame.h"
19 23
20 namespace safe_browsing { 24 namespace safe_browsing {
21 25
22 // A map for keeping track of the identity of DOM Elements, used to generate 26 // A map for keeping track of the identity of DOM Elements, used to generate
23 // unique IDs for each element and look up element IDs by parent Element, to 27 // unique IDs for each element and look up element IDs by parent Element, to
24 // maintain proper parent/child relationships. 28 // maintain proper parent/child relationships.
25 // The key is a WebNode from the DOM, which is basically a pointer so can be 29 // The key is a WebNode from the DOM, which is basically a pointer so can be
26 // copied into the map when inserting new elements. 30 // copied into the map when inserting new elements.
27 // The values are pointers to IPC messages generated by ThreatDOMDetails. They 31 // The values are indices into the resource vector, and are used to retrieve IPC
28 // are not owned by the map - ownership remains with the vector of resources 32 // messages generated by ThreatDOMDetails.
29 // collected by this class. 33 using ElementToNodeMap = std::map<blink::WebNode, int>;
30 typedef std::map<blink::WebNode, SafeBrowsingHostMsg_ThreatDOMDetails_Node*> 34
31 ElementToNodeMap; 35 // This Feature specifies which HTML Elements to collect based on their tag and
Jialiu Lin 2017/02/25 00:46:18 In addition to "iframe", "frame", "embed" and "scr
lpz 2017/02/27 15:46:43 clarified that it's only for "non-resource" elemen
36 // attributes. It's a single param containing a comma-separated list of pairs.
37 // For example: "tag1,id,tag1,height,tag2,foo" - this will collect elements with
38 // tag "tag1" that have attribute "id" or "height" set, and elements of tag
39 // "tag2" if they have attribute "foo" set.
40 // All tag names and attributes should be lower case.
41 const base::Feature kThreatDomDetailsTagAndAttributeFeature{
42 "ThreatDomDetailsTagAttributes", base::FEATURE_DISABLED_BY_DEFAULT};
43
44 // The name of the param containing the tags and attributes list.
45 const char kTagAndAttributeParamName[] = "tag_attribute_csv";
46
47 // A map containing the attributes of interest for some tag. The key is a tag
48 // name and the value is a collection of attribute names. If a tag-attribute
49 // pair exists in this map, then it should be collected by ThreatDOMDetails.
50 using TagToAttributesMap =
51 std::map<std::string, std::unordered_set<std::string>>;
vakh (use Gerrit instead) 2017/02/24 19:08:38 You might want to consider using: std::map<std::st
lpz 2017/02/27 15:46:43 Thanks - good point about using a vector, I don't
vakh (use Gerrit instead) 2017/02/27 17:24:35 The advantage is the same: cache locality (big win
lpz 2017/02/28 22:53:28 Thanks Varun, PTAL. Re: lifetime, yes I believe t
vakh (use Gerrit instead) 2017/02/28 23:00:04 Acknowledged. Thanks for evaluating both approache
32 52
33 namespace { 53 namespace {
34 54
55 void ParseTagAndAttributeParams(TagToAttributesMap* tag_to_attributes_map) {
56 if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) {
57 return;
58 }
59 if (!tag_to_attributes_map) {
vakh (use Gerrit instead) 2017/02/24 19:08:38 Would this condition mask a real problem? It can o
lpz 2017/02/27 15:46:43 Yes, switched to a dcheck
60 return;
61 }
62 tag_to_attributes_map->clear();
63 const std::string& tag_attribute_csv_param =
64 base::GetFieldTrialParamValueByFeature(
65 kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName);
66 if (tag_attribute_csv_param.empty()) {
67 return;
68 }
69
70 std::vector<std::string> split =
71 base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE,
72 base::SPLIT_WANT_NONEMPTY);
73 for (size_t i = 0; i < split.size(); i += 2) {
Jialiu Lin 2017/02/25 00:46:18 Maybe do a DCHECK if split.size() is an even numbe
vakh (use Gerrit instead) 2017/02/25 00:48:07 Good idea, but if you do this, might as well do th
lpz 2017/02/27 15:46:43 Done.
74 if (i == split.size() - 1) {
75 // We're at the end of the vector but don't have enough elements for
76 // a pair, exit now and ignore this last tag.
77 break;
78 }
79 (*tag_to_attributes_map)[split[i]].insert(split[i + 1]);
80 }
81 }
82
83 SafeBrowsingHostMsg_ThreatDOMDetails_Node* GetNodeForElement(
84 const blink::WebNode& element,
85 const safe_browsing::ElementToNodeMap& element_to_node_map,
86 std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources) {
87 DCHECK(element_to_node_map.count(element) > 0);
88 int resource_index = element_to_node_map.at(element);
89 return &(resources->at(resource_index));
90 }
91
35 // Handler for the various HTML elements that we extract URLs from. 92 // Handler for the various HTML elements that we extract URLs from.
36 void HandleElement( 93 void HandleElement(
37 const blink::WebElement& element, 94 const blink::WebElement& element,
38 SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node, 95 SafeBrowsingHostMsg_ThreatDOMDetails_Node* summary_node,
39 std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources, 96 std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources,
40 safe_browsing::ElementToNodeMap* element_to_node_map) { 97 safe_browsing::ElementToNodeMap* element_to_node_map) {
41 if (!element.hasAttribute("src"))
42 return;
43
44 // Retrieve the link and resolve the link in case it's relative. 98 // Retrieve the link and resolve the link in case it's relative.
45 blink::WebURL full_url = 99 blink::WebURL full_url =
46 element.document().completeURL(element.getAttribute("src")); 100 element.document().completeURL(element.getAttribute("src"));
47 101
48 const GURL& child_url = GURL(full_url); 102 const GURL& child_url = GURL(full_url);
49 103
50 // Add to the parent node. 104 // Update summary node with the URL if this element has one.
vakh (use Gerrit instead) 2017/02/24 19:08:38 Code is clear enough so the comment is not particu
lpz 2017/02/27 15:46:43 Done.
51 parent_node->children.push_back(child_url); 105 if (!child_url.is_empty() && child_url.is_valid()) {
106 summary_node->children.push_back(child_url);
107 }
52 108
53 // Create the child node. 109 // Create the child node.
vakh (use Gerrit instead) 2017/02/24 19:08:38 Same here
lpz 2017/02/27 15:46:43 Done.
54 resources->push_back(SafeBrowsingHostMsg_ThreatDOMDetails_Node()); 110 SafeBrowsingHostMsg_ThreatDOMDetails_Node child_node;
55 SafeBrowsingHostMsg_ThreatDOMDetails_Node* child_node = &resources->back(); 111 child_node.url = child_url;
56 child_node->url = child_url; 112 child_node.tag_name = element.tagName().utf8();
57 child_node->tag_name = element.tagName().utf8(); 113 child_node.parent = summary_node->url;
58 child_node->parent = parent_node->url;
59 114
60 // Update the ID mapping. First generate the ID for the current node. 115 // Update the ID mapping. First generate the ID for the current node.
61 // Then, if its parent is available, set the current node's parent ID, and 116 // Then, if its parent is available, set the current node's parent ID, and
62 // also update the parent's children with the current node's ID. 117 // also update the parent's children with the current node's ID.
63 const int child_id = element_to_node_map->size() + 1; 118 const int child_id = element_to_node_map->size() + 1;
64 child_node->node_id = child_id; 119 child_node.node_id = child_id;
65 if (!element.parentNode().isNull()) { 120 blink::WebNode cur_parent_element = element.parentNode();
66 auto parent_node_iter = element_to_node_map->find(element.parentNode()); 121 while (!cur_parent_element.isNull()) {
67 if (parent_node_iter != element_to_node_map->end()) { 122 if (element_to_node_map->count(cur_parent_element) > 0) {
68 child_node->parent_node_id = parent_node->node_id; 123 SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node =
124 GetNodeForElement(cur_parent_element, *element_to_node_map,
125 resources);
126 child_node.parent_node_id = parent_node->node_id;
69 parent_node->child_node_ids.push_back(child_id); 127 parent_node->child_node_ids.push_back(child_id);
128
129 // TODO(lpz): Consider also updating the URL-level parent/child mapping
130 // here. Eg: child_node.parent=parent_node.url, and
131 // parent_node.children.push_back(child_url).
132 break;
133 } else {
134 // It's possible that the direct parent of this node wasn't handled, so it
135 // isn't represented in |element_to_node_map|. Try walking up the
136 // hierarchy to see if a parent further up was handled.
137 cur_parent_element = cur_parent_element.parentNode();
70 } 138 }
71 } 139 }
72 (*element_to_node_map)[element] = child_node; 140 // Add the child node to the list of resources.
141 resources->push_back(child_node);
142 // .. and remember which index it was inserted at so we can look it up later.
143 (*element_to_node_map)[element] = resources->size() - 1;
73 } 144 }
74 145
146 bool ShouldHandleElement(const blink::WebElement& element,
147 const TagToAttributesMap& tag_to_attribute_map) {
vakh (use Gerrit instead) 2017/02/24 19:08:38 nit: s/tag_to_attribute_map/tag_to_attributes_map
lpz 2017/02/27 15:46:43 Done.
148 // Resources with a SRC are always handled.
149 if ((element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||
150 element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) &&
151 element.hasAttribute("src")) {
152 return true;
153 }
154
155 std::string tag_name_lower = base::ToLowerASCII(element.tagName().ascii());
156 const auto& tag_attribute_iter = tag_to_attribute_map.find(tag_name_lower);
157 if (tag_attribute_iter == tag_to_attribute_map.end()) {
158 return false;
159 }
160
161 const std::unordered_set<std::string>& valid_attributes =
162 tag_attribute_iter->second;
163 for (const std::string& attribute : valid_attributes) {
164 if (element.hasAttribute(blink::WebString::fromASCII(attribute))) {
165 return true;
166 }
167 }
168 return false;
169 }
75 } // namespace 170 } // namespace
76 171
77 // An upper limit on the number of nodes we collect. 172 // An upper limit on the number of nodes we collect.
78 uint32_t ThreatDOMDetails::kMaxNodes = 500; 173 uint32_t ThreatDOMDetails::kMaxNodes = 500;
79 174
80 // static 175 // static
81 ThreatDOMDetails* ThreatDOMDetails::Create(content::RenderFrame* render_frame) { 176 ThreatDOMDetails* ThreatDOMDetails::Create(content::RenderFrame* render_frame) {
82 // Private constructor and public static Create() method to facilitate 177 // Private constructor and public static Create() method to facilitate
83 // stubbing out this class for binary-size reduction purposes. 178 // stubbing out this class for binary-size reduction purposes.
84 return new ThreatDOMDetails(render_frame); 179 return new ThreatDOMDetails(render_frame);
(...skipping 28 matching lines...) Expand all
113 return; 208 return;
114 SafeBrowsingHostMsg_ThreatDOMDetails_Node details_node; 209 SafeBrowsingHostMsg_ThreatDOMDetails_Node details_node;
115 blink::WebDocument document = frame->document(); 210 blink::WebDocument document = frame->document();
116 details_node.url = GURL(document.url()); 211 details_node.url = GURL(document.url());
117 if (document.isNull()) { 212 if (document.isNull()) {
118 // Nothing in this frame. Just report its URL. 213 // Nothing in this frame. Just report its URL.
119 resources->push_back(details_node); 214 resources->push_back(details_node);
120 return; 215 return;
121 } 216 }
122 217
218 TagToAttributesMap tag_to_attributes_map;
219 ParseTagAndAttributeParams(&tag_to_attributes_map);
vakh (use Gerrit instead) 2017/02/27 17:24:35 Can't this be done just once, during init/start-up
lpz 2017/02/28 22:53:28 Done.
220
123 ElementToNodeMap element_to_node_map; 221 ElementToNodeMap element_to_node_map;
124 blink::WebElementCollection elements = document.all(); 222 blink::WebElementCollection elements = document.all();
125 blink::WebElement element = elements.firstItem(); 223 blink::WebElement element = elements.firstItem();
126 for (; !element.isNull(); element = elements.nextItem()) { 224 for (; !element.isNull(); element = elements.nextItem()) {
127 if (element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") || 225 if (ShouldHandleElement(element, tag_to_attributes_map)) {
128 element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) {
129 HandleElement(element, &details_node, resources, &element_to_node_map); 226 HandleElement(element, &details_node, resources, &element_to_node_map);
130 if (resources->size() >= kMaxNodes) { 227 if (resources->size() >= kMaxNodes) {
131 // We have reached kMaxNodes, exit early. 228 // We have reached kMaxNodes, exit early.
132 resources->push_back(details_node); 229 resources->push_back(details_node);
133 return; 230 return;
134 } 231 }
135 } 232 }
136 } 233 }
137 resources->push_back(details_node); 234 resources->push_back(details_node);
138 } 235 }
139 236
140 void ThreatDOMDetails::OnDestruct() { 237 void ThreatDOMDetails::OnDestruct() {
141 delete this; 238 delete this;
142 } 239 }
143 240
144 } // namespace safe_browsing 241 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698