Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(551)

Side by Side Diff: chrome/renderer/safe_browsing/threat_dom_details.cc

Issue 2713233002: Update ThreatDOMDetails to be able to collect non-resource HTML Elements based on their attributes. (Closed)
Patch Set: Address comments Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/renderer/safe_browsing/threat_dom_details.h" 5 #include "chrome/renderer/safe_browsing/threat_dom_details.h"
6 6
7 #include <map> 7 #include <map>
8 #include <unordered_set>
8 9
9 #include "base/compiler_specific.h" 10 #include "base/compiler_specific.h"
11 #include "base/metrics/field_trial_params.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_split.h"
10 #include "base/strings/stringprintf.h" 14 #include "base/strings/stringprintf.h"
11 #include "components/safe_browsing/common/safebrowsing_messages.h" 15 #include "components/safe_browsing/common/safebrowsing_messages.h"
12 #include "content/public/renderer/render_frame.h" 16 #include "content/public/renderer/render_frame.h"
13 #include "third_party/WebKit/public/platform/WebString.h" 17 #include "third_party/WebKit/public/platform/WebString.h"
14 #include "third_party/WebKit/public/web/WebDocument.h" 18 #include "third_party/WebKit/public/web/WebDocument.h"
15 #include "third_party/WebKit/public/web/WebElement.h" 19 #include "third_party/WebKit/public/web/WebElement.h"
16 #include "third_party/WebKit/public/web/WebElementCollection.h" 20 #include "third_party/WebKit/public/web/WebElementCollection.h"
17 #include "third_party/WebKit/public/web/WebFrame.h" 21 #include "third_party/WebKit/public/web/WebFrame.h"
18 #include "third_party/WebKit/public/web/WebLocalFrame.h" 22 #include "third_party/WebKit/public/web/WebLocalFrame.h"
19 23
20 namespace safe_browsing { 24 namespace safe_browsing {
21 25
22 // A map for keeping track of the identity of DOM Elements, used to generate 26 // A map for keeping track of the identity of DOM Elements, used to generate
23 // unique IDs for each element and lookup elements IDs by parent Element, to 27 // unique IDs for each element and lookup elements IDs by parent Element, to
24 // maintain proper parent/child relationships. 28 // maintain proper parent/child relationships.
25 // The key is a WebNode from the DOM, which is basically a pointer so can be 29 // The key is a WebNode from the DOM, which is basically a pointer so can be
26 // copied into the map when inserting new elements. 30 // copied into the map when inserting new elements.
27 // The values are pointers to IPC messages generated by ThreatDOMDetails. They 31 // The values are indices into the resource vector, and are used to retrieve IPC
28 // are not owned by the map - ownership remains with the vector of resources 32 // messages generated by ThreatDOMDetails.
29 // collected by this class. 33 using ElementToNodeMap = std::map<blink::WebNode, int>;
30 typedef std::map<blink::WebNode, SafeBrowsingHostMsg_ThreatDOMDetails_Node*> 34
31 ElementToNodeMap; 35 // This Feature specifies which non-resource HTML Elements to collect based on
36 // their tag and attributes. It's a single param containing a comma-separated
37 // list of pairs. For example: "tag1,id,tag1,height,tag2,foo" - this will
Nathan Parker 2017/02/27 23:00:36 Should they be lowercased?
lpz 2017/02/28 22:53:28 Yes - the code and the comment both mention that.
Nathan Parker 2017/02/28 23:27:10 SGTM
38 // collect elements with tag "tag1" that have attribute "id" or "height" set,
39 // and elements of tag "tag2" if they have attribute "foo" set. All tag names
40 // and attributes should be lower case.
41 const base::Feature kThreatDomDetailsTagAndAttributeFeature{
42 "ThreatDomDetailsTagAttributes", base::FEATURE_DISABLED_BY_DEFAULT};
43
44 // The name of the param containing the tags and attributes list.
45 const char kTagAndAttributeParamName[] = "tag_attribute_csv";
46
47 // A map containing the attributes of interest for some tag. The key is a tag
48 // name and the value is a collection of attribute names. If a tag-attribute
49 // pair exists in this map, then it should be collected by ThreatDOMDetails.
50 using TagToAttributesMap = std::map<std::string, std::vector<std::string>>;
32 51
33 namespace { 52 namespace {
34 53
54 void ParseTagAndAttributeParams(TagToAttributesMap* tag_to_attributes_map) {
55 DCHECK(tag_to_attributes_map);
vakh (use Gerrit instead) 2017/02/27 17:24:35 optional and nit: this will fail on line 59 specta
Nathan Parker 2017/02/27 23:00:36 ...It does add a little value since it wouldn't
lpz 2017/02/28 22:53:28 Ack, leaving the dcheck for doc'ing purposes
56 if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) {
57 return;
58 }
59 tag_to_attributes_map->clear();
60 const std::string& tag_attribute_csv_param =
61 base::GetFieldTrialParamValueByFeature(
62 kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName);
63 if (tag_attribute_csv_param.empty()) {
64 return;
65 }
66
67 std::vector<std::string> split =
68 base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE,
69 base::SPLIT_WANT_NONEMPTY);
70 // If we don't have the right number of pairs in the csv then don't bother
71 // parsing further.
72 if (split.size() % 2 != 0) {
73 return;
74 }
75 for (size_t i = 0; i < split.size(); i += 2) {
76 (*tag_to_attributes_map)[split[i]].push_back(split[i + 1]);
77 }
78 }
79
80 SafeBrowsingHostMsg_ThreatDOMDetails_Node* GetNodeForElement(
81 const blink::WebNode& element,
82 const safe_browsing::ElementToNodeMap& element_to_node_map,
83 std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources) {
84 DCHECK(element_to_node_map.count(element) > 0);
85 int resource_index = element_to_node_map.at(element);
86 return &(resources->at(resource_index));
87 }
88
35 // Handler for the various HTML elements that we extract URLs from. 89 // Handler for the various HTML elements that we extract URLs from.
36 void HandleElement( 90 void HandleElement(
37 const blink::WebElement& element, 91 const blink::WebElement& element,
38 SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node, 92 SafeBrowsingHostMsg_ThreatDOMDetails_Node* summary_node,
39 std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources, 93 std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources,
40 safe_browsing::ElementToNodeMap* element_to_node_map) { 94 safe_browsing::ElementToNodeMap* element_to_node_map) {
41 if (!element.hasAttribute("src"))
42 return;
43
44 // Retrieve the link and resolve the link in case it's relative. 95 // Retrieve the link and resolve the link in case it's relative.
45 blink::WebURL full_url = 96 blink::WebURL full_url =
46 element.document().completeURL(element.getAttribute("src")); 97 element.document().completeURL(element.getAttribute("src"));
Nathan Parker 2017/02/27 23:00:36 What does getAttribute("src") do if there is no
lpz 2017/02/28 22:53:28 It behaves nicely - returns an empty object (url o
47 98
48 const GURL& child_url = GURL(full_url); 99 const GURL& child_url = GURL(full_url);
100 if (!child_url.is_empty() && child_url.is_valid()) {
101 summary_node->children.push_back(child_url);
102 }
49 103
50 // Add to the parent node. 104 SafeBrowsingHostMsg_ThreatDOMDetails_Node child_node;
51 parent_node->children.push_back(child_url); 105 child_node.url = child_url;
52 106 child_node.tag_name = element.tagName().utf8();
53 // Create the child node. 107 child_node.parent = summary_node->url;
54 resources->push_back(SafeBrowsingHostMsg_ThreatDOMDetails_Node());
55 SafeBrowsingHostMsg_ThreatDOMDetails_Node* child_node = &resources->back();
56 child_node->url = child_url;
57 child_node->tag_name = element.tagName().utf8();
58 child_node->parent = parent_node->url;
59 108
60 // Update the ID mapping. First generate the ID for the current node. 109 // Update the ID mapping. First generate the ID for the current node.
61 // Then, if its parent is available, set the current node's parent ID, and 110 // Then, if its parent is available, set the current node's parent ID, and
62 // also update the parent's children with the current node's ID. 111 // also update the parent's children with the current node's ID.
63 const int child_id = element_to_node_map->size() + 1; 112 const int child_id = element_to_node_map->size() + 1;
64 child_node->node_id = child_id; 113 child_node.node_id = child_id;
65 if (!element.parentNode().isNull()) { 114 blink::WebNode cur_parent_element = element.parentNode();
66 auto parent_node_iter = element_to_node_map->find(element.parentNode()); 115 while (!cur_parent_element.isNull()) {
67 if (parent_node_iter != element_to_node_map->end()) { 116 if (element_to_node_map->count(cur_parent_element) > 0) {
68 child_node->parent_node_id = parent_node->node_id; 117 SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node =
118 GetNodeForElement(cur_parent_element, *element_to_node_map,
119 resources);
120 child_node.parent_node_id = parent_node->node_id;
69 parent_node->child_node_ids.push_back(child_id); 121 parent_node->child_node_ids.push_back(child_id);
122
123 // TODO(lpz): Consider also updating the URL-level parent/child mapping
124 // here. Eg: child_node.parent=parent_node.url, and
125 // parent_node.children.push_back(child_url).
126 break;
127 } else {
128 // It's possible that the direct parent of this node wasn't handled, so it
Nathan Parker 2017/02/27 23:00:36 What's the use case for this? Will this link a sub
lpz 2017/02/28 22:53:28 imagine something like: <div foo> <div style-stu
129 // isn't represented in |element_to_node_map|. Try walking up the
130 // hierarchy to see if a parent further up was handled.
131 cur_parent_element = cur_parent_element.parentNode();
70 } 132 }
71 } 133 }
72 (*element_to_node_map)[element] = child_node; 134 // Add the child node to the list of resources.
135 resources->push_back(child_node);
136 // .. and remember which index it was inserted at so we can look it up later.
137 (*element_to_node_map)[element] = resources->size() - 1;
73 } 138 }
74 139
140 bool ShouldHandleElement(const blink::WebElement& element,
141 const TagToAttributesMap& tag_to_attributes_map) {
142 // Resources with a SRC are always handled.
143 if ((element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||
Nathan Parker 2017/02/27 23:00:36 An aside: Are there any other tags we might want b
lpz 2017/02/28 22:53:28 Not sure of the answer but, at a glance, there are
144 element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) &&
145 element.hasAttribute("src")) {
146 return true;
147 }
148
149 std::string tag_name_lower = base::ToLowerASCII(element.tagName().ascii());
150 const auto& tag_attribute_iter = tag_to_attributes_map.find(tag_name_lower);
151 if (tag_attribute_iter == tag_to_attributes_map.end()) {
152 return false;
153 }
154
155 const std::vector<std::string>& valid_attributes = tag_attribute_iter->second;
156 for (const std::string& attribute : valid_attributes) {
157 if (element.hasAttribute(blink::WebString::fromASCII(attribute))) {
158 return true;
159 }
160 }
161 return false;
162 }
75 } // namespace 163 } // namespace
76 164
77 // An upper limit on the number of nodes we collect. 165 // An upper limit on the number of nodes we collect.
78 uint32_t ThreatDOMDetails::kMaxNodes = 500; 166 uint32_t ThreatDOMDetails::kMaxNodes = 500;
79 167
80 // static 168 // static
81 ThreatDOMDetails* ThreatDOMDetails::Create(content::RenderFrame* render_frame) { 169 ThreatDOMDetails* ThreatDOMDetails::Create(content::RenderFrame* render_frame) {
82 // Private constructor and public static Create() method to facilitate 170 // Private constructor and public static Create() method to facilitate
83 // stubbing out this class for binary-size reduction purposes. 171 // stubbing out this class for binary-size reduction purposes.
84 return new ThreatDOMDetails(render_frame); 172 return new ThreatDOMDetails(render_frame);
(...skipping 28 matching lines...) Expand all
113 return; 201 return;
114 SafeBrowsingHostMsg_ThreatDOMDetails_Node details_node; 202 SafeBrowsingHostMsg_ThreatDOMDetails_Node details_node;
115 blink::WebDocument document = frame->document(); 203 blink::WebDocument document = frame->document();
116 details_node.url = GURL(document.url()); 204 details_node.url = GURL(document.url());
117 if (document.isNull()) { 205 if (document.isNull()) {
118 // Nothing in this frame. Just report its URL. 206 // Nothing in this frame. Just report its URL.
119 resources->push_back(details_node); 207 resources->push_back(details_node);
120 return; 208 return;
121 } 209 }
122 210
211 TagToAttributesMap tag_to_attributes_map;
212 ParseTagAndAttributeParams(&tag_to_attributes_map);
213
123 ElementToNodeMap element_to_node_map; 214 ElementToNodeMap element_to_node_map;
124 blink::WebElementCollection elements = document.all(); 215 blink::WebElementCollection elements = document.all();
125 blink::WebElement element = elements.firstItem(); 216 blink::WebElement element = elements.firstItem();
126 for (; !element.isNull(); element = elements.nextItem()) { 217 for (; !element.isNull(); element = elements.nextItem()) {
127 if (element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") || 218 if (ShouldHandleElement(element, tag_to_attributes_map)) {
128 element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) {
129 HandleElement(element, &details_node, resources, &element_to_node_map); 219 HandleElement(element, &details_node, resources, &element_to_node_map);
130 if (resources->size() >= kMaxNodes) { 220 if (resources->size() >= kMaxNodes) {
131 // We have reached kMaxNodes, exit early. 221 // We have reached kMaxNodes, exit early.
132 resources->push_back(details_node); 222 resources->push_back(details_node);
133 return; 223 return;
134 } 224 }
135 } 225 }
136 } 226 }
137 resources->push_back(details_node); 227 resources->push_back(details_node);
138 } 228 }
139 229
140 void ThreatDOMDetails::OnDestruct() { 230 void ThreatDOMDetails::OnDestruct() {
141 delete this; 231 delete this;
142 } 232 }
143 233
144 } // namespace safe_browsing 234 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698