| Index: chrome/renderer/safe_browsing/threat_dom_details.cc
 | 
| diff --git a/chrome/renderer/safe_browsing/threat_dom_details.cc b/chrome/renderer/safe_browsing/threat_dom_details.cc
 | 
| index 45e314523497be9fce453322cae6fb9fc47d7dc5..671e2bbdc91523cdfa6e7e9902204fe0e3042cfc 100644
 | 
| --- a/chrome/renderer/safe_browsing/threat_dom_details.cc
 | 
| +++ b/chrome/renderer/safe_browsing/threat_dom_details.cc
 | 
| @@ -4,9 +4,14 @@
 | 
|  
 | 
|  #include "chrome/renderer/safe_browsing/threat_dom_details.h"
 | 
|  
 | 
| +#include <algorithm>
 | 
|  #include <map>
 | 
| +#include <unordered_set>
 | 
|  
 | 
|  #include "base/compiler_specific.h"
 | 
| +#include "base/metrics/field_trial_params.h"
 | 
| +#include "base/strings/string_piece.h"
 | 
| +#include "base/strings/string_split.h"
 | 
|  #include "base/strings/stringprintf.h"
 | 
|  #include "components/safe_browsing/common/safebrowsing_messages.h"
 | 
|  #include "content/public/renderer/render_frame.h"
 | 
| @@ -24,56 +29,174 @@ namespace safe_browsing {
 | 
|  // maintain proper parent/child relationships.
 | 
|  // The key is a WebNode from the DOM, which is basically a pointer so can be
 | 
|  // copied into the map when inserting new elements.
 | 
| -// The values are pointers to IPC messages generated by ThreatDOMDetails. They
 | 
| -// are not owned by the map - ownership remains with the vector of resources
 | 
| -// collected by this class.
 | 
| -typedef std::map<blink::WebNode, SafeBrowsingHostMsg_ThreatDOMDetails_Node*>
 | 
| -    ElementToNodeMap;
 | 
| +// The values are indices into the resource vector, and are used to retrieve IPC
 | 
| +// messages generated by ThreatDOMDetails.
 | 
| +using ElementToNodeMap = std::map<blink::WebNode, int>;
 | 
| +
 | 
| +// This Feature specifies which non-resource HTML Elements to collect based on
 | 
| +// their tag and attributes. It has a single param holding a flat,
 | 
| +// comma-separated list in which consecutive entries form (tag, attribute)
 | 
| +// pairs. For example, "tag1,id,tag1,height,tag2,foo" collects elements with
 | 
| +// tag "tag1" that have attribute "id" or "height" set, and elements with tag
 | 
| +// "tag2" that have attribute "foo" set. All tag names and attributes should be
 | 
| +// lower case. A list with an odd number of entries is ignored entirely.
 | 
| +const base::Feature kThreatDomDetailsTagAndAttributeFeature{
 | 
| +    "ThreatDomDetailsTagAttributes", base::FEATURE_DISABLED_BY_DEFAULT};
 | 
| +
 | 
| +// The name of the param containing the tags and attributes list.
 | 
| +const char kTagAndAttributeParamName[] = "tag_attribute_csv";
 | 
|  
 | 
|  namespace {
 | 
|  
 | 
| +// Predicate used to search |tag_and_attributes_list_| by tag_name.
 | 
| +class TagNameIs {
 | 
| + public:
 | 
| +  explicit TagNameIs(const std::string& tag) : tag_(tag) {}
 | 
| +  bool operator()(const TagAndAttributesItem& tag_and_attribute) {
 | 
| +    return tag_ == tag_and_attribute.tag_name;
 | 
| +  }
 | 
| +
 | 
| + private:
 | 
| +  std::string tag_;
 | 
| +};
 | 
| +
 | 
| +void ParseTagAndAttributeParams(
 | 
| +    std::vector<TagAndAttributesItem>* tag_and_attributes_list) {
 | 
| +  DCHECK(tag_and_attributes_list);
 | 
| +  if (!base::FeatureList::IsEnabled(kThreatDomDetailsTagAndAttributeFeature)) {
 | 
| +    return;
 | 
| +  }
 | 
| +  tag_and_attributes_list->clear();
 | 
| +  const std::string& tag_attribute_csv_param =
 | 
| +      base::GetFieldTrialParamValueByFeature(
 | 
| +          kThreatDomDetailsTagAndAttributeFeature, kTagAndAttributeParamName);
 | 
| +  if (tag_attribute_csv_param.empty()) {
 | 
| +    return;
 | 
| +  }
 | 
| +
 | 
| +  std::vector<std::string> split =
 | 
| +      base::SplitString(tag_attribute_csv_param, ",", base::TRIM_WHITESPACE,
 | 
| +                        base::SPLIT_WANT_NONEMPTY);
 | 
| +  // If the csv doesn't hold an even number of entries (complete tag/attribute
 | 
| +  // pairs) then don't bother parsing further.
 | 
| +  if (split.size() % 2 != 0) {
 | 
| +    return;
 | 
| +  }
 | 
| +  for (size_t i = 0; i < split.size(); i += 2) {
 | 
| +    const std::string& tag_name = split[i];
 | 
| +    const std::string& attribute = split[i + 1];
 | 
| +    auto item_iter =
 | 
| +        std::find_if(tag_and_attributes_list->begin(),
 | 
| +                     tag_and_attributes_list->end(), TagNameIs(tag_name));
 | 
| +    if (item_iter == tag_and_attributes_list->end()) {
 | 
| +      TagAndAttributesItem item;
 | 
| +      item.tag_name = tag_name;
 | 
| +      item.attributes.push_back(attribute);
 | 
| +      tag_and_attributes_list->push_back(item);
 | 
| +    } else {
 | 
| +      item_iter->attributes.push_back(attribute);
 | 
| +    }
 | 
| +  }
 | 
| +
 | 
| +  std::sort(tag_and_attributes_list->begin(), tag_and_attributes_list->end(),
 | 
| +            [](const TagAndAttributesItem& a, const TagAndAttributesItem& b) {
 | 
| +              return a.tag_name < b.tag_name;
 | 
| +            });
 | 
| +}
 | 
| +
 | 
| +SafeBrowsingHostMsg_ThreatDOMDetails_Node* GetNodeForElement(
 | 
| +    const blink::WebNode& element,
 | 
| +    const safe_browsing::ElementToNodeMap& element_to_node_map,
 | 
| +    std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources) {
 | 
| +  DCHECK(element_to_node_map.count(element) > 0);
 | 
| +  int resource_index = element_to_node_map.at(element);
 | 
| +  return &(resources->at(resource_index));
 | 
| +}
 | 
| +
 | 
|  // Handler for the various HTML elements that we extract URLs from.
 | 
|  void HandleElement(
 | 
|      const blink::WebElement& element,
 | 
| -    SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node,
 | 
| +    SafeBrowsingHostMsg_ThreatDOMDetails_Node* summary_node,
 | 
|      std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>* resources,
 | 
|      safe_browsing::ElementToNodeMap* element_to_node_map) {
 | 
| -  if (!element.hasAttribute("src"))
 | 
| -    return;
 | 
| -
 | 
|    // Retrieve the link and resolve the link in case it's relative.
 | 
|    blink::WebURL full_url =
 | 
|        element.document().completeURL(element.getAttribute("src"));
 | 
|  
 | 
|    const GURL& child_url = GURL(full_url);
 | 
| +  if (!child_url.is_empty() && child_url.is_valid()) {
 | 
| +    summary_node->children.push_back(child_url);
 | 
| +  }
 | 
|  
 | 
| -  // Add to the parent node.
 | 
| -  parent_node->children.push_back(child_url);
 | 
| -
 | 
| -  // Create the child node.
 | 
| -  resources->push_back(SafeBrowsingHostMsg_ThreatDOMDetails_Node());
 | 
| -  SafeBrowsingHostMsg_ThreatDOMDetails_Node* child_node = &resources->back();
 | 
| -  child_node->url = child_url;
 | 
| -  child_node->tag_name = element.tagName().utf8();
 | 
| -  child_node->parent = parent_node->url;
 | 
| +  SafeBrowsingHostMsg_ThreatDOMDetails_Node child_node;
 | 
| +  child_node.url = child_url;
 | 
| +  child_node.tag_name = element.tagName().utf8();
 | 
| +  child_node.parent = summary_node->url;
 | 
|  
 | 
|    // Update the ID mapping. First generate the ID for the current node.
 | 
|    // Then, if its parent is available, set the current node's parent ID, and
 | 
|    // also update the parent's children with the current node's ID.
 | 
|    const int child_id = element_to_node_map->size() + 1;
 | 
| -  child_node->node_id = child_id;
 | 
| -  if (!element.parentNode().isNull()) {
 | 
| -    auto parent_node_iter = element_to_node_map->find(element.parentNode());
 | 
| -    if (parent_node_iter != element_to_node_map->end()) {
 | 
| -      child_node->parent_node_id = parent_node->node_id;
 | 
| +  child_node.node_id = child_id;
 | 
| +  blink::WebNode cur_parent_element = element.parentNode();
 | 
| +  while (!cur_parent_element.isNull()) {
 | 
| +    if (element_to_node_map->count(cur_parent_element) > 0) {
 | 
| +      SafeBrowsingHostMsg_ThreatDOMDetails_Node* parent_node =
 | 
| +          GetNodeForElement(cur_parent_element, *element_to_node_map,
 | 
| +                            resources);
 | 
| +      child_node.parent_node_id = parent_node->node_id;
 | 
|        parent_node->child_node_ids.push_back(child_id);
 | 
| +
 | 
| +      // TODO(lpz): Consider also updating the URL-level parent/child mapping
 | 
| +      // here. Eg: child_node.parent=parent_node.url, and
 | 
| +      // parent_node.children.push_back(child_url).
 | 
| +      break;
 | 
| +    } else {
 | 
| +      // It's possible that the direct parent of this node wasn't handled, so it
 | 
| +      // isn't represented in |element_to_node_map|. Try walking up the
 | 
| +      // hierarchy to see if a parent further up was handled.
 | 
| +      cur_parent_element = cur_parent_element.parentNode();
 | 
|      }
 | 
|    }
 | 
| -  (*element_to_node_map)[element] = child_node;
 | 
| +  // Add the child node to the list of resources.
 | 
| +  resources->push_back(child_node);
 | 
| +  // ... and remember which index it was inserted at so we can look it up later.
 | 
| +  (*element_to_node_map)[element] = resources->size() - 1;
 | 
|  }
 | 
|  
 | 
| +bool ShouldHandleElement(
 | 
| +    const blink::WebElement& element,
 | 
| +    const std::vector<TagAndAttributesItem>& tag_and_attributes_list) {
 | 
| +  // Resources with a SRC are always handled.
 | 
| +  if ((element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||
 | 
| +       element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) &&
 | 
| +      element.hasAttribute("src")) {
 | 
| +    return true;
 | 
| +  }
 | 
| +
 | 
| +  std::string tag_name_lower = base::ToLowerASCII(element.tagName().ascii());
 | 
| +  const auto& tag_attribute_iter =
 | 
| +      std::find_if(tag_and_attributes_list.begin(),
 | 
| +                   tag_and_attributes_list.end(), TagNameIs(tag_name_lower));
 | 
| +  if (tag_attribute_iter == tag_and_attributes_list.end()) {
 | 
| +    return false;
 | 
| +  }
 | 
| +
 | 
| +  const std::vector<std::string>& valid_attributes =
 | 
| +      tag_attribute_iter->attributes;
 | 
| +  for (const std::string& attribute : valid_attributes) {
 | 
| +    if (element.hasAttribute(blink::WebString::fromASCII(attribute))) {
 | 
| +      return true;
 | 
| +    }
 | 
| +  }
 | 
| +  return false;
 | 
| +}
 | 
|  }  // namespace
 | 
|  
 | 
| +TagAndAttributesItem::TagAndAttributesItem() {}
 | 
| +TagAndAttributesItem::TagAndAttributesItem(const TagAndAttributesItem& item)
 | 
| +    : tag_name(item.tag_name), attributes(item.attributes) {}
 | 
| +TagAndAttributesItem::~TagAndAttributesItem() {}
 | 
| +
 | 
|  // An upper limit on the number of nodes we collect.
 | 
|  uint32_t ThreatDOMDetails::kMaxNodes = 500;
 | 
|  
 | 
| @@ -85,7 +208,9 @@ ThreatDOMDetails* ThreatDOMDetails::Create(content::RenderFrame* render_frame) {
 | 
|  }
 | 
|  
 | 
|  ThreatDOMDetails::ThreatDOMDetails(content::RenderFrame* render_frame)
 | 
| -    : content::RenderFrameObserver(render_frame) {}
 | 
| +    : content::RenderFrameObserver(render_frame) {
 | 
| +  ParseTagAndAttributeParams(&tag_and_attributes_list_);
 | 
| +}
 | 
|  
 | 
|  ThreatDOMDetails::~ThreatDOMDetails() {}
 | 
|  
 | 
| @@ -124,8 +249,7 @@ void ThreatDOMDetails::ExtractResources(
 | 
|    blink::WebElementCollection elements = document.all();
 | 
|    blink::WebElement element = elements.firstItem();
 | 
|    for (; !element.isNull(); element = elements.nextItem()) {
 | 
| -    if (element.hasHTMLTagName("iframe") || element.hasHTMLTagName("frame") ||
 | 
| -        element.hasHTMLTagName("embed") || element.hasHTMLTagName("script")) {
 | 
| +    if (ShouldHandleElement(element, tag_and_attributes_list_)) {
 | 
|        HandleElement(element, &details_node, resources, &element_to_node_map);
 | 
|        if (resources->size() >= kMaxNodes) {
 | 
|          // We have reached kMaxNodes, exit early.
 | 
| 
 |