Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1073)

Unified Diff: chrome/renderer/safe_browsing/threat_dom_details.cc

Issue 2713233002: Update ThreatDOMDetails to be able to collect non-resource HTML Elements based on their attributes. (Closed)
Patch Set: Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/renderer/safe_browsing/threat_dom_details.cc
diff --git a/chrome/renderer/safe_browsing/threat_dom_details.cc b/chrome/renderer/safe_browsing/threat_dom_details.cc
index 45e314523497be9fce453322cae6fb9fc47d7dc5..a96907b3d6e67f5ab1747a77b6942f6911330fd0 100644
--- a/chrome/renderer/safe_browsing/threat_dom_details.cc
+++ b/chrome/renderer/safe_browsing/threat_dom_details.cc
@@ -5,8 +5,12 @@
#include "chrome/renderer/safe_browsing/threat_dom_details.h"
#include <map>
+#include <unordered_set>
#include "base/compiler_specific.h"
+#include "base/metrics/field_trial_params.h"
+#include "base/strings/string_piece.h"
+#include "base/strings/string_split.h"
#include "base/strings/stringprintf.h"
#include "components/safe_browsing/common/safebrowsing_messages.h"
#include "content/public/renderer/render_frame.h"
@@ -24,54 +28,145 @@ namespace safe_browsing {
// maintain proper parent/child relationships.
// They key is a WebNode from the DOM, which is basically a pointer so can be
// copied into the map when inserting new elements.
-// The values are pointers to IPC messages generated by ThreatDOMDetails. They
-// are not owned by the map - ownership remains with the vector of resources
-// collected by this class.
-typedef std::map<blink::WebNode, SafeBrowsingHostMsg_ThreatDOMDetails_Node*>
- ElementToNodeMap;
+// The values are indices into the resource vector, and are used to retrieve IPC
+// messages generated by ThreatDOMDetails.
+using ElementToNodeMap = std::map<blink::WebNode, int>;
+
+// This Feature specifies which HTML Elements to collect based on their tag and
Jialiu Lin 2017/02/25 00:46:18 In addition to "iframe", "frame", "embed" and "scr
lpz 2017/02/27 15:46:43 clarified that it's only for "non-resource" elemen
+// attributes. It's a single param containing a comma-separated list of pairs.
+// For example: "tag1,id,tag1,height,tag2,foo" - this will collect elements with
+// tag "tag1" that have attribute "id" or "height" set, and elements of tag
+// "tag2" if they have attribute "foo" set.
+// All tag names and attributes should be lower case.
+const base::Feature kThreatDomDetailsTagAndAttributeFeature{
+ "ThreatDomDetailsTagAttributes", base::FEATURE_DISABLED_BY_DEFAULT};
+
+// The name of the param containing the tags and attributes list.
+const char kTagAndAttributeParamName[] = "tag_attribute_csv";
+
+// A map containing the attributes of interest for some tag. The key is a tag
+// name and the value is a collection of attribute names. If a tag-attribute
+// pair exists in this map, then it should be collected by ThreatDOMDetails.
+using TagToAttributesMap =
+ std::map<std::string, std::unordered_set<std::string>>;
vakh (use Gerrit instead) 2017/02/24 19:08:38 You might want to consider using: std::map<std::st
lpz 2017/02/27 15:46:43 Thanks - good point about using a vector, I don't
vakh (use Gerrit instead) 2017/02/27 17:24:35 The advantage is the same: cache locality (big win
lpz 2017/02/28 22:53:28 Thanks Varun, PTAL. Re: lifetime, yes I believe t
vakh (use Gerrit instead) 2017/02/28 23:00:04 Acknowledged. Thanks for evaluating both approache
namespace {
+void ParseTagAndAttributeParams(TagToAttributesMap* tag_to_attributes_map) {
+ if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) {
+ return;
+ }
+ if (!tag_to_attributes_map) {
vakh (use Gerrit instead) 2017/02/24 19:08:38 Would this condition mask a real problem? It can o
lpz 2017/02/27 15:46:43 Yes, switched to a dcheck
+ return;
+ }
+ tag_to_attributes_map->clear();
+ const std::string& tag_attribute_csv_param =
+ base::GetFieldTrialParamValueByFeature(
+ kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName);
+ if (tag_attribute_csv_param.empty()) {
+ return;
+ }
+
+ std::vector<std::string> split =
+ base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE,
+ base::SPLIT_WANT_NONEMPTY);
+ for (size_t i = 0; i < split.size(); i += 2) {
Jialiu Lin 2017/02/25 00:46:18 Maybe do a DCHECK if split.size() is an even numbe
vakh (use Gerrit instead) 2017/02/25 00:48:07 Good idea, but if you do this, might as well do th
lpz 2017/02/27 15:46:43 Done.
+ if (i == split.size() - 1) {
+ // We're at the end of the vector but don't have enough elements for
+ // a pair, exit now and ignore this last tag.
+ break;
+ }
+ (*tag_to_attributes_map)[split[i]].insert(split[i + 1]);
+ }
+}
+
+SafeBrowsingHostMsg_ThreatDOMDetails_Node* GetNodeForElement(
+ const blink::WebNode& element,
+ const safe_browsing::ElementToNodeMap& element_to_node_map,
+ std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources) {
+ DCHECK(element_to_node_map.count(element) > 0);
+ int resource_index = element_to_node_map.at(element);
+ return &(resources->at(resource_index));
+}
+
// Handler for the various HTML elements that we extract URLs from.
void HandleElement(
const blink::WebElement& element,
- SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node,
+ SafeBrowsingHostMsg_ThreatDOMDetails_Node* summary_node,
std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources,
safe_browsing::ElementToNodeMap* element_to_node_map) {
- if (!element.hasAttribute("src"))
- return;
-
// Retrieve the link and resolve the link in case it's relative.
blink::WebURL full_url =
element.document().completeURL(element.getAttribute("src"));
const GURL& child_url = GURL(full_url);
- // Add to the parent node.
- parent_node->children.push_back(child_url);
+ // Update summary node with the URL if this element has one.
vakh (use Gerrit instead) 2017/02/24 19:08:38 Code is clear enough so the comment is not particu
lpz 2017/02/27 15:46:43 Done.
+ if (!child_url.is_empty() && child_url.is_valid()) {
+ summary_node->children.push_back(child_url);
+ }
// Create the child node.
vakh (use Gerrit instead) 2017/02/24 19:08:38 Same here
lpz 2017/02/27 15:46:43 Done.
- resources->push_back(SafeBrowsingHostMsg_ThreatDOMDetails_Node());
- SafeBrowsingHostMsg_ThreatDOMDetails_Node* child_node = &resources->back();
- child_node->url = child_url;
- child_node->tag_name = element.tagName().utf8();
- child_node->parent = parent_node->url;
+ SafeBrowsingHostMsg_ThreatDOMDetails_Node child_node;
+ child_node.url = child_url;
+ child_node.tag_name = element.tagName().utf8();
+ child_node.parent = summary_node->url;
// Update the ID mapping. First generate the ID for the current node.
// Then, if its parent is available, set the current node's parent ID, and
// also update the parent's children with the current node's ID.
const int child_id = element_to_node_map->size() + 1;
- child_node->node_id = child_id;
- if (!element.parentNode().isNull()) {
- auto parent_node_iter = element_to_node_map->find(element.parentNode());
- if (parent_node_iter != element_to_node_map->end()) {
- child_node->parent_node_id = parent_node->node_id;
+ child_node.node_id = child_id;
+ blink::WebNode cur_parent_element = element.parentNode();
+ while (!cur_parent_element.isNull()) {
+ if (element_to_node_map->count(cur_parent_element) > 0) {
+ SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node =
+ GetNodeForElement(cur_parent_element, *element_to_node_map,
+ resources);
+ child_node.parent_node_id = parent_node->node_id;
parent_node->child_node_ids.push_back(child_id);
+
+ // TODO(lpz): Consider also updating the URL-level parent/child mapping
+ // here. Eg: child_node.parent=parent_node.url, and
+ // parent_node.children.push_back(child_url).
+ break;
+ } else {
+ // It's possible that the direct parent of this node wasn't handled, so it
+ // isn't represented in |element_to_node_map|. Try walking up the
+ // hierarchy to see if a parent further up was handled.
+ cur_parent_element = cur_parent_element.parentNode();
}
}
- (*element_to_node_map)[element] = child_node;
+ // Add the child node to the list of resources.
+ resources->push_back(child_node);
+ // .. and remember which index it was inserted at so we can look it up later.
+ (*element_to_node_map)[element] = resources->size() - 1;
}
+bool ShouldHandleElement(const blink::WebElement& element,
+ const TagToAttributesMap& tag_to_attribute_map) {
vakh (use Gerrit instead) 2017/02/24 19:08:38 nit: s/tag_to_attribute_map/tag_to_attributes_map
lpz 2017/02/27 15:46:43 Done.
+ // Resources with a SRC are always handled.
+ if ((element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||
+ element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) &&
+ element.hasAttribute("src")) {
+ return true;
+ }
+
+ std::string tag_name_lower = base::ToLowerASCII(element.tagName().ascii());
+ const auto& tag_attribute_iter = tag_to_attribute_map.find(tag_name_lower);
+ if (tag_attribute_iter == tag_to_attribute_map.end()) {
+ return false;
+ }
+
+ const std::unordered_set<std::string>& valid_attributes =
+ tag_attribute_iter->second;
+ for (const std::string& attribute : valid_attributes) {
+ if (element.hasAttribute(blink::WebString::fromASCII(attribute))) {
+ return true;
+ }
+ }
+ return false;
+}
} // namespace
// An upper limit on the number of nodes we collect.
@@ -120,12 +215,14 @@ void ThreatDOMDetails::ExtractResources(
return;
}
+ TagToAttributesMap tag_to_attributes_map;
+ ParseTagAndAttributeParams(&tag_to_attributes_map);
vakh (use Gerrit instead) 2017/02/27 17:24:35 Can't this be done just once, during init/start-up
lpz 2017/02/28 22:53:28 Done.
+
ElementToNodeMap element_to_node_map;
blink::WebElementCollection elements = document.all();
blink::WebElement element = elements.firstItem();
for (; !element.isNull(); element = elements.nextItem()) {
- if (element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||
- element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) {
+ if (ShouldHandleElement(element, tag_to_attributes_map)) {
HandleElement(element, &details_node, resources, &element_to_node_map);
if (resources->size() >= kMaxNodes) {
// We have reached kMaxNodes, exit early.

Powered by Google App Engine
This is Rietveld 408576698