chrome/renderer/safe_browsing/threat_dom_details.cc - Issue 2713233002: Update ThreatDOMDetails to be able to collect non-resource HTML Elements based on their attributes.

Unified Diff: chrome/renderer/safe_browsing/threat_dom_details.cc

Issue 2713233002: Update ThreatDOMDetails to be able to collect non-resource HTML Elements based on their attributes. (Closed)

Patch Set: Address comments Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« chrome/browser/safe_browsing/threat_details.cc ('K') | « chrome/renderer/safe_browsing/threat_dom_details.h ('k') | chrome/renderer/safe_browsing/threat_dom_details_browsertest.cc » ('j') | chrome/renderer/safe_browsing/threat_dom_details_browsertest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: chrome/renderer/safe_browsing/threat_dom_details.cc

diff --git a/chrome/renderer/safe_browsing/threat_dom_details.cc b/chrome/renderer/safe_browsing/threat_dom_details.cc

index 45e314523497be9fce453322cae6fb9fc47d7dc5..5d99d59ad3843917c541a4ce291c6db5d537d2b3 100644

--- a/chrome/renderer/safe_browsing/threat_dom_details.cc

+++ b/chrome/renderer/safe_browsing/threat_dom_details.cc

@@ -5,8 +5,12 @@

#include "chrome/renderer/safe_browsing/threat_dom_details.h"

#include <map>

+#include <unordered_set>

#include "base/compiler_specific.h"

+#include "base/metrics/field_trial_params.h"

+#include "base/strings/string_piece.h"

+#include "base/strings/string_split.h"

#include "base/strings/stringprintf.h"

#include "components/safe_browsing/common/safebrowsing_messages.h"

#include "content/public/renderer/render_frame.h"

@@ -24,54 +28,138 @@ namespace safe_browsing {

// maintain proper parent/child relationships.

// The key is a WebNode from the DOM, which is basically a pointer so can be

// copied into the map when inserting new elements.

-// The values are pointers to IPC messages generated by ThreatDOMDetails. They

-// are not owned by the map - ownership remains with the vector of resources

-// collected by this class.

-typedef std::map<blink::WebNode, SafeBrowsingHostMsg_ThreatDOMDetails_Node*>

- ElementToNodeMap;

+// The values are indices into the resource vector, and are used to retrieve IPC

+// messages generated by ThreatDOMDetails.

+using ElementToNodeMap = std::map<blink::WebNode, int>;

+// This Feature specifies which non-resource HTML Elements to collect based on

+// their tag and attributes. It's a single param containing a comma-separated

+// list of pairs. For example: "tag1,id,tag1,height,tag2,foo" - this will

Nathan Parker 2017/02/27 23:00:36 Should they be lowercased?

lpz 2017/02/28 22:53:28 Yes - the code and the comment both mention that.

Nathan Parker 2017/02/28 23:27:10 SGTM

+// collect elements with tag "tag1" that have attribute "id" or "height" set,

+// and elements of tag "tag2" if they have attribute "foo" set. All tag names

+// and attributes should be lower case.

+const base::Feature kThreatDomDetailsTagAndAttributeFeature{

+ "ThreatDomDetailsTagAttributes", base::FEATURE_DISABLED_BY_DEFAULT};

+// The name of the param containing the tags and attributes list.

+const char kTagAndAttributeParamName[] = "tag_attribute_csv";

+// A map containing the attributes of interest for some tag. The key is a tag

+// name and the value is a collection of attribute names. If a tag-attribute

+// pair exists in this map, then it should be collected by ThreatDOMDetails.

+using TagToAttributesMap = std::map<std::string, std::vector<std::string>>;

namespace {

+void ParseTagAndAttributeParams(TagToAttributesMap* tag_to_attributes_map) {

+ DCHECK(tag_to_attributes_map);

vakh (use Gerrit instead) 2017/02/27 17:24:35 optional and nit: this will fail on line 59 specta

Nathan Parker 2017/02/27 23:00:36 ...It does add a little value since it wouldn't

lpz 2017/02/28 22:53:28 Ack, leaving the dcheck for doc'ing purposes

+ if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) {

+ return;

+ }

+ tag_to_attributes_map->clear();

+ const std::string& tag_attribute_csv_param =

+ base::GetFieldTrialParamValueByFeature(

+ kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName);

+ if (tag_attribute_csv_param.empty()) {

+ return;

+ }

+ std::vector<std::string> split =

+ base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE,

+ base::SPLIT_WANT_NONEMPTY);

+ // If we don't have the right number of pairs in the csv then don't bother

+ // parsing further.

+ if (split.size() % 2 != 0) {

+ return;

+ }

+ for (size_t i = 0; i < split.size(); i += 2) {

+ (*tag_to_attributes_map)[split[i]].push_back(split[i + 1]);

+ }

+SafeBrowsingHostMsg_ThreatDOMDetails_Node* GetNodeForElement(

+ const blink::WebNode& element,

+ const safe_browsing::ElementToNodeMap& element_to_node_map,

+ std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources) {

+ DCHECK(element_to_node_map.count(element) > 0);

+ int resource_index = element_to_node_map.at(element);

+ return &(resources->at(resource_index));

// Handler for the various HTML elements that we extract URLs from.

void HandleElement(

const blink::WebElement& element,

- SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node,

+ SafeBrowsingHostMsg_ThreatDOMDetails_Node* summary_node,

std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources,

safe_browsing::ElementToNodeMap* element_to_node_map) {

- if (!element.hasAttribute("src"))

- return;

// Retrieve the link and resolve the link in case it's relative.

blink::WebURL full_url =

element.document().completeURL(element.getAttribute("src"));

Nathan Parker 2017/02/27 23:00:36 What does getAttribute("src") do if there is no

lpz 2017/02/28 22:53:28 It behaves nicely - returns an empty object (url o

const GURL& child_url = GURL(full_url);

+ if (!child_url.is_empty() && child_url.is_valid()) {

+ summary_node->children.push_back(child_url);

+ }

- // Add to the parent node.

- parent_node->children.push_back(child_url);

- // Create the child node.

- resources->push_back(SafeBrowsingHostMsg_ThreatDOMDetails_Node());

- SafeBrowsingHostMsg_ThreatDOMDetails_Node* child_node = &resources->back();

- child_node->url = child_url;

- child_node->tag_name = element.tagName().utf8();

- child_node->parent = parent_node->url;

+ SafeBrowsingHostMsg_ThreatDOMDetails_Node child_node;

+ child_node.url = child_url;

+ child_node.tag_name = element.tagName().utf8();

+ child_node.parent = summary_node->url;

// Update the ID mapping. First generate the ID for the current node.

// Then, if its parent is available, set the current node's parent ID, and

// also update the parent's children with the current node's ID.

const int child_id = element_to_node_map->size() + 1;

- child_node->node_id = child_id;

- if (!element.parentNode().isNull()) {

- auto parent_node_iter = element_to_node_map->find(element.parentNode());

- if (parent_node_iter != element_to_node_map->end()) {

- child_node->parent_node_id = parent_node->node_id;

+ child_node.node_id = child_id;

+ blink::WebNode cur_parent_element = element.parentNode();

+ while (!cur_parent_element.isNull()) {

+ if (element_to_node_map->count(cur_parent_element) > 0) {

+ SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node =

+ GetNodeForElement(cur_parent_element, *element_to_node_map,

+ resources);

+ child_node.parent_node_id = parent_node->node_id;

parent_node->child_node_ids.push_back(child_id);

+ // TODO(lpz): Consider also updating the URL-level parent/child mapping

+ // here. Eg: child_node.parent=parent_node.url, and

+ // parent_node.children.push_back(child_url).

+ break;

+ } else {

+ // It's possible that the direct parent of this node wasn't handled, so it

Nathan Parker 2017/02/27 23:00:36 What's the use case for this? Will this link a sub

lpz 2017/02/28 22:53:28 imagine something like: <div foo> <div style-stu

+ // isn't represented in |element_to_node_map|. Try walking up the

+ // hierarchy to see if a parent further up was handled.

+ cur_parent_element = cur_parent_element.parentNode();

}

- (*element_to_node_map)[element] = child_node;

+ // Add the child node to the list of resources.

+ resources->push_back(child_node);

+ // .. and remember which index it was inserted at so we can look it up later.

+ (*element_to_node_map)[element] = resources->size() - 1;

}

+bool ShouldHandleElement(const blink::WebElement& element,

+ const TagToAttributesMap& tag_to_attributes_map) {

+ // Resources with a SRC are always handled.

+ if ((element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||

Nathan Parker 2017/02/27 23:00:36 An aside: Are there any other tags we might want b

lpz 2017/02/28 22:53:28 Not sure of the answer but, at a glance, there are

+ element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) &&

+ element.hasAttribute("src")) {

+ return true;

+ }

+ std::string tag_name_lower = base::ToLowerASCII(element.tagName().ascii());

+ const auto& tag_attribute_iter = tag_to_attributes_map.find(tag_name_lower);

+ if (tag_attribute_iter == tag_to_attributes_map.end()) {

+ return false;

+ }

+ const std::vector<std::string>& valid_attributes = tag_attribute_iter->second;

+ for (const std::string& attribute : valid_attributes) {

+ if (element.hasAttribute(blink::WebString::fromASCII(attribute))) {

+ return true;

+ }

+ return false;

} // namespace

// An upper limit on the number of nodes we collect.

@@ -120,12 +208,14 @@ void ThreatDOMDetails::ExtractResources(

return;

}

+ TagToAttributesMap tag_to_attributes_map;

+ ParseTagAndAttributeParams(&tag_to_attributes_map);

ElementToNodeMap element_to_node_map;

blink::WebElementCollection elements = document.all();

blink::WebElement element = elements.firstItem();

for (; !element.isNull(); element = elements.nextItem()) {

- if (element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||

- element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) {

+ if (ShouldHandleElement(element, tag_to_attributes_map)) {

HandleElement(element, &details_node, resources, &element_to_node_map);

if (resources->size() >= kMaxNodes) {

// We have reached kMaxNodes, exit early.