Chromium Code Reviews| Index: chrome/renderer/safe_browsing/threat_dom_details.cc |
| diff --git a/chrome/renderer/safe_browsing/threat_dom_details.cc b/chrome/renderer/safe_browsing/threat_dom_details.cc |
| index 45e314523497be9fce453322cae6fb9fc47d7dc5..a96907b3d6e67f5ab1747a77b6942f6911330fd0 100644 |
| --- a/chrome/renderer/safe_browsing/threat_dom_details.cc |
| +++ b/chrome/renderer/safe_browsing/threat_dom_details.cc |
| @@ -5,8 +5,12 @@ |
| #include "chrome/renderer/safe_browsing/threat_dom_details.h" |
| #include <map> |
| +#include <unordered_set> |
| #include "base/compiler_specific.h" |
| +#include "base/metrics/field_trial_params.h" |
| +#include "base/strings/string_piece.h" |
| +#include "base/strings/string_split.h" |
| #include "base/strings/stringprintf.h" |
| #include "components/safe_browsing/common/safebrowsing_messages.h" |
| #include "content/public/renderer/render_frame.h" |
| @@ -24,54 +28,145 @@ namespace safe_browsing { |
| // maintain proper parent/child relationships. |
| // They key is a WebNode from the DOM, which is basically a pointer so can be |
| // copied into the map when inserting new elements. |
| -// The values are pointers to IPC messages generated by ThreatDOMDetails. They |
| -// are not owned by the map - ownership remains with the vector of resources |
| -// collected by this class. |
| -typedef std::map<blink::WebNode, SafeBrowsingHostMsg_ThreatDOMDetails_Node*> |
| - ElementToNodeMap; |
| +// The values are indices into the resource vector, and are used to retrieve IPC |
| +// messages generated by ThreatDOMDetails. |
| +using ElementToNodeMap = std::map<blink::WebNode, int>; |
| + |
| +// This Feature specifies which HTML Elements to collect based on their tag and |
|
Jialiu Lin
2017/02/25 00:46:18
In addition to "iframe", "frame", "embed" and "scr
lpz
2017/02/27 15:46:43
clarified that it's only for "non-resource" elemen
|
| +// attributes. It's a single param containing a comma-separated list of pairs. |
| +// For example: "tag1,id,tag1,height,tag2,foo" - this will collect elements with |
| +// tag "tag1" that have attribute "id" or "height" set, and elements of tag |
| +// "tag2" if they have attribute "foo" set. |
| +// All tag names and attributes should be lower case. |
| +const base::Feature kThreatDomDetailsTagAndAttributeFeature{ |
| + "ThreatDomDetailsTagAttributes", base::FEATURE_DISABLED_BY_DEFAULT}; |
| + |
| +// The name of the param containing the tags and attributes list. |
| +const char kTagAndAttributeParamName[] = "tag_attribute_csv"; |
| + |
| +// A map containing the attributes of interest for some tag. The key is a tag |
| +// name and the value is a collection of attribute names. If a tag-attribute |
| +// pair exists in this map, then it should be collected by ThreatDOMDetails. |
| +using TagToAttributesMap = |
| + std::map<std::string, std::unordered_set<std::string>>; |
|
vakh (use Gerrit instead)
2017/02/24 19:08:38
You might want to consider using:
std::map<std::st
lpz
2017/02/27 15:46:43
Thanks - good point about using a vector, I don't
vakh (use Gerrit instead)
2017/02/27 17:24:35
The advantage is the same: cache locality (big win
lpz
2017/02/28 22:53:28
Thanks Varun, PTAL.
Re: lifetime, yes I believe t
vakh (use Gerrit instead)
2017/02/28 23:00:04
Acknowledged. Thanks for evaluating both approache
|
| namespace { |
| +void ParseTagAndAttributeParams(TagToAttributesMap* tag_to_attributes_map) { |
| + if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) { |
| + return; |
| + } |
| + if (!tag_to_attributes_map) { |
|
vakh (use Gerrit instead)
2017/02/24 19:08:38
Would this condition mask a real problem?
It can o
lpz
2017/02/27 15:46:43
Yes, switched to a dcheck
|
| + return; |
| + } |
| + tag_to_attributes_map->clear(); |
| + const std::string& tag_attribute_csv_param = |
| + base::GetFieldTrialParamValueByFeature( |
| + kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName); |
| + if (tag_attribute_csv_param.empty()) { |
| + return; |
| + } |
| + |
| + std::vector<std::string> split = |
| + base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE, |
| + base::SPLIT_WANT_NONEMPTY); |
| + for (size_t i = 0; i < split.size(); i += 2) { |
|
Jialiu Lin
2017/02/25 00:46:18
Maybe do a DCHECK if split.size() is an even numbe
vakh (use Gerrit instead)
2017/02/25 00:48:07
Good idea, but if you do this, might as well do th
lpz
2017/02/27 15:46:43
Done.
|
| + if (i == split.size() - 1) { |
| + // We're at the end of the vector but don't have enough elements for |
| + // a pair, exit now and ignore this last tag. |
| + break; |
| + } |
| + (*tag_to_attributes_map)[split[i]].insert(split[i + 1]); |
| + } |
| +} |
| + |
| +SafeBrowsingHostMsg_ThreatDOMDetails_Node* GetNodeForElement( |
| + const blink::WebNode& element, |
| + const safe_browsing::ElementToNodeMap& element_to_node_map, |
| + std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources) { |
| + DCHECK(element_to_node_map.count(element) > 0); |
| + int resource_index = element_to_node_map.at(element); |
| + return &(resources->at(resource_index)); |
| +} |
| + |
| // Handler for the various HTML elements that we extract URLs from. |
| void HandleElement( |
| const blink::WebElement& element, |
| - SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node, |
| + SafeBrowsingHostMsg_ThreatDOMDetails_Node* summary_node, |
| std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources, |
| safe_browsing::ElementToNodeMap* element_to_node_map) { |
| - if (!element.hasAttribute("src")) |
| - return; |
| - |
| // Retrieve the link and resolve the link in case it's relative. |
| blink::WebURL full_url = |
| element.document().completeURL(element.getAttribute("src")); |
| const GURL& child_url = GURL(full_url); |
| - // Add to the parent node. |
| - parent_node->children.push_back(child_url); |
| + // Update summary node with the URL if this element has one. |
|
vakh (use Gerrit instead)
2017/02/24 19:08:38
Code is clear enough so the comment is not particu
lpz
2017/02/27 15:46:43
Done.
|
| + if (!child_url.is_empty() && child_url.is_valid()) { |
| + summary_node->children.push_back(child_url); |
| + } |
| // Create the child node. |
|
vakh (use Gerrit instead)
2017/02/24 19:08:38
Same here
lpz
2017/02/27 15:46:43
Done.
|
| - resources->push_back(SafeBrowsingHostMsg_ThreatDOMDetails_Node()); |
| - SafeBrowsingHostMsg_ThreatDOMDetails_Node* child_node = &resources->back(); |
| - child_node->url = child_url; |
| - child_node->tag_name = element.tagName().utf8(); |
| - child_node->parent = parent_node->url; |
| + SafeBrowsingHostMsg_ThreatDOMDetails_Node child_node; |
| + child_node.url = child_url; |
| + child_node.tag_name = element.tagName().utf8(); |
| + child_node.parent = summary_node->url; |
| // Update the ID mapping. First generate the ID for the current node. |
| // Then, if its parent is available, set the current node's parent ID, and |
| // also update the parent's children with the current node's ID. |
| const int child_id = element_to_node_map->size() + 1; |
| - child_node->node_id = child_id; |
| - if (!element.parentNode().isNull()) { |
| - auto parent_node_iter = element_to_node_map->find(element.parentNode()); |
| - if (parent_node_iter != element_to_node_map->end()) { |
| - child_node->parent_node_id = parent_node->node_id; |
| + child_node.node_id = child_id; |
| + blink::WebNode cur_parent_element = element.parentNode(); |
| + while (!cur_parent_element.isNull()) { |
| + if (element_to_node_map->count(cur_parent_element) > 0) { |
| + SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node = |
| + GetNodeForElement(cur_parent_element, *element_to_node_map, |
| + resources); |
| + child_node.parent_node_id = parent_node->node_id; |
| parent_node->child_node_ids.push_back(child_id); |
| + |
| + // TODO(lpz): Consider also updating the URL-level parent/child mapping |
| + // here. Eg: child_node.parent=parent_node.url, and |
| + // parent_node.children.push_back(child_url). |
| + break; |
| + } else { |
| + // It's possible that the direct parent of this node wasn't handled, so it |
| + // isn't represented in |element_to_node_map|. Try walking up the |
| + // hierarchy to see if a parent further up was handled. |
| + cur_parent_element = cur_parent_element.parentNode(); |
| } |
| } |
| - (*element_to_node_map)[element] = child_node; |
| + // Add the child node to the list of resources. |
| + resources->push_back(child_node); |
| + // .. and remember which index it was inserted at so we can look it up later. |
| + (*element_to_node_map)[element] = resources->size() - 1; |
| } |
| +bool ShouldHandleElement(const blink::WebElement& element, |
| + const TagToAttributesMap& tag_to_attribute_map) { |
|
vakh (use Gerrit instead)
2017/02/24 19:08:38
nit: s/tag_to_attribute_map/tag_to_attributes_map
lpz
2017/02/27 15:46:43
Done.
|
| + // Resources with a SRC are always handled. |
| + if ((element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") || |
| + element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) && |
| + element.hasAttribute("src")) { |
| + return true; |
| + } |
| + |
| + std::string tag_name_lower = base::ToLowerASCII(element.tagName().ascii()); |
| + const auto& tag_attribute_iter = tag_to_attribute_map.find(tag_name_lower); |
| + if (tag_attribute_iter == tag_to_attribute_map.end()) { |
| + return false; |
| + } |
| + |
| + const std::unordered_set<std::string>& valid_attributes = |
| + tag_attribute_iter->second; |
| + for (const std::string& attribute : valid_attributes) { |
| + if (element.hasAttribute(blink::WebString::fromASCII(attribute))) { |
| + return true; |
| + } |
| + } |
| + return false; |
| +} |
| } // namespace |
| // An upper limit on the number of nodes we collect. |
| @@ -120,12 +215,14 @@ void ThreatDOMDetails::ExtractResources( |
| return; |
| } |
| + TagToAttributesMap tag_to_attributes_map; |
| + ParseTagAndAttributeParams(&tag_to_attributes_map); |
|
vakh (use Gerrit instead)
2017/02/27 17:24:35
Can't this be done just once, during init/start-up
lpz
2017/02/28 22:53:28
Done.
|
| + |
| ElementToNodeMap element_to_node_map; |
| blink::WebElementCollection elements = document.all(); |
| blink::WebElement element = elements.firstItem(); |
| for (; !element.isNull(); element = elements.nextItem()) { |
| - if (element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") || |
| - element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) { |
| + if (ShouldHandleElement(element, tag_to_attributes_map)) { |
| HandleElement(element, &details_node, resources, &element_to_node_map); |
| if (resources->size() >= kMaxNodes) { |
| // We have reached kMaxNodes, exit early. |