Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(119)

Side by Side Diff: components/safe_browsing/browser/threat_details.cc

Issue 2837603002: Content API changes to improve DOM stitching in ThreatDetails code. (Closed)
Patch Set: Address feedback Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Implementation of the ThreatDetails class. 5 // Implementation of the ThreatDetails class.
6 6
7 #include "components/safe_browsing/browser/threat_details.h" 7 #include "components/safe_browsing/browser/threat_details.h"
8 8
9 #include <stddef.h> 9 #include <stddef.h>
10 #include <stdint.h> 10 #include <stdint.h>
11 11
12 #include "base/bind.h" 12 #include "base/bind.h"
13 #include "base/lazy_instance.h" 13 #include "base/lazy_instance.h"
14 #include "base/metrics/histogram_macros.h" 14 #include "base/metrics/histogram_macros.h"
15 #include "base/strings/string_util.h" 15 #include "base/strings/string_util.h"
16 #include "components/history/core/browser/history_service.h" 16 #include "components/history/core/browser/history_service.h"
17 #include "components/safe_browsing/base_ui_manager.h" 17 #include "components/safe_browsing/base_ui_manager.h"
18 #include "components/safe_browsing/browser/threat_details_cache.h" 18 #include "components/safe_browsing/browser/threat_details_cache.h"
19 #include "components/safe_browsing/browser/threat_details_history.h" 19 #include "components/safe_browsing/browser/threat_details_history.h"
20 #include "components/safe_browsing/common/safebrowsing_messages.h" 20 #include "components/safe_browsing/common/safebrowsing_messages.h"
21 #include "content/public/browser/browser_thread.h" 21 #include "content/public/browser/browser_thread.h"
22 #include "content/public/browser/navigation_controller.h" 22 #include "content/public/browser/navigation_controller.h"
23 #include "content/public/browser/navigation_entry.h" 23 #include "content/public/browser/navigation_entry.h"
24 #include "content/public/browser/render_frame_host.h" 24 #include "content/public/browser/render_frame_host.h"
25 #include "content/public/browser/render_process_host.h"
25 #include "content/public/browser/web_contents.h" 26 #include "content/public/browser/web_contents.h"
26 #include "net/url_request/url_request_context_getter.h" 27 #include "net/url_request/url_request_context_getter.h"
27 28
28 using content::BrowserThread; 29 using content::BrowserThread;
29 using content::NavigationEntry; 30 using content::NavigationEntry;
30 using content::RenderFrameHost; 31 using content::RenderFrameHost;
31 using content::WebContents; 32 using content::WebContents;
32 33
33 // Keep in sync with KMaxNodes in components/safe_browsing/renderer/ 34 // Keep in sync with KMaxNodes in components/safe_browsing/renderer/
34 // threat_dom_details.cc 35 // threat_dom_details.cc
(...skipping 244 matching lines...) Expand 10 before | Expand all | Expand 10 after
279 } 280 }
280 if (!duplicate_child) 281 if (!duplicate_child)
281 url_resource->add_child_ids(child_resource->id()); 282 url_resource->add_child_ids(child_resource->id());
282 } 283 }
283 } 284 }
284 285
285 return url_resource; 286 return url_resource;
286 } 287 }
287 288
288 void ThreatDetails::AddDomElement( 289 void ThreatDetails::AddDomElement(
290 const int process_id,
289 const int frame_tree_node_id, 291 const int frame_tree_node_id,
290 const std::string& frame_url, 292 const int child_frame_routing_id,
Charlie Reis 2017/05/10 22:17:49 Both process_id and child_frame_routing_id look unused…
lpz 2017/05/12 13:53:16 Yep, and some cascading param cleanup from this in
291 const int element_node_id, 293 const int element_node_id,
292 const std::string& tagname, 294 const std::string& tagname,
293 const int parent_element_node_id, 295 const int parent_element_node_id,
294 const std::vector<AttributeNameValue>& attributes, 296 const std::vector<AttributeNameValue>& attributes,
295 const ClientSafeBrowsingReportRequest::Resource* resource) { 297 const ClientSafeBrowsingReportRequest::Resource* resource) {
296 // Create the element. It should not exist already since this function should 298 // Create the element. It should not exist already since this function should
297 // only be called once for each element. 299 // only be called once for each element.
298 const std::string element_key = 300 const std::string element_key =
299 GetElementKey(frame_tree_node_id, element_node_id); 301 GetElementKey(frame_tree_node_id, element_node_id);
300 HTMLElement* cur_element = FindOrCreateElement(element_key); 302 HTMLElement* cur_element = FindOrCreateElement(element_key);
301 303
302 // Set some basic metadata about the element. 304 // Set some basic metadata about the element.
303 const std::string tag_name_upper = base::ToUpperASCII(tagname); 305 const std::string tag_name_upper = base::ToUpperASCII(tagname);
304 if (!tag_name_upper.empty()) { 306 if (!tag_name_upper.empty()) {
305 cur_element->set_tag(tag_name_upper); 307 cur_element->set_tag(tag_name_upper);
306 } 308 }
307 for (const AttributeNameValue& attribute : attributes) { 309 for (const AttributeNameValue& attribute : attributes) {
308 HTMLElement::Attribute* attribute_pb = cur_element->add_attribute(); 310 HTMLElement::Attribute* attribute_pb = cur_element->add_attribute();
309 attribute_pb->set_name(attribute.first); 311 attribute_pb->set_name(attribute.first);
310 attribute_pb->set_value(attribute.second); 312 attribute_pb->set_value(attribute.second);
311 } 313 }
312 bool is_frame = tag_name_upper == "IFRAME" || tag_name_upper == "FRAME";
313 314
314 if (resource) { 315 if (resource) {
315 cur_element->set_resource_id(resource->id()); 316 cur_element->set_resource_id(resource->id());
316
317 // For iframes, remember that this HTML Element represents an iframe with a
318 // specific URL. Elements from a frame with this URL are children of this
319 // element.
320 if (is_frame &&
321 !base::ContainsKey(iframe_src_to_element_map_, resource->url())) {
322 iframe_src_to_element_map_[resource->url()] = cur_element;
323 }
324 } 317 }
325 318
326 // Next we try to lookup the parent of the current element and add ourselves 319 // Next we try to lookup the parent of the current element and add ourselves
327 // as a child of it. 320 // as a child of it.
328 HTMLElement* parent_element = nullptr; 321 HTMLElement* parent_element = nullptr;
329 if (parent_element_node_id == 0) { 322 if (parent_element_node_id == 0) {
330 // No parent indicates that this element is at the top of the current frame. 323 // No parent indicates that this element is at the top of the current frame.
331 // This frame could be a child of an iframe in another frame, or it could be 324 // Remember that this is a top-level element of the frame with the
332 // at the root of the whole page. If we have a frame URL then we can try to 325 // current |frame_tree_node_id|. If this element is inside an iframe, a
333 // map this element to its parent. 326 // second pass will insert this element as a child of its parent iframe.
334 if (!frame_url.empty()) { 327 frame_tree_id_to_children_map_[frame_tree_node_id].insert(
335 // First, remember that this element is at the top-level of a frame with 328 cur_element->id());
336 // our frame URL.
337 document_url_to_children_map_[frame_url].insert(cur_element->id());
338
339 // Now check if the frame URL matches the src URL of an iframe elsewhere.
340 // This means that we processed the parent iframe element earlier, so we
341 // can add ourselves as a child of that iframe.
342 // If no such iframe exists, it could be processed later, or this element
343 // is in the top-level frame and truly has no parent.
344 if (base::ContainsKey(iframe_src_to_element_map_, frame_url)) {
345 parent_element = iframe_src_to_element_map_[frame_url];
346 }
347 }
348 } else { 329 } else {
349 // We have a parent ID, so this element is just a child of something inside 330 // We have a parent ID, so this element is just a child of something inside
350 // of our current frame. We can easily lookup our parent. 331 // of our current frame. We can easily lookup our parent.
351 const std::string& parent_key = 332 const std::string& parent_key =
352 GetElementKey(frame_tree_node_id, parent_element_node_id); 333 GetElementKey(frame_tree_node_id, parent_element_node_id);
353 if (base::ContainsKey(elements_, parent_key)) { 334 if (base::ContainsKey(elements_, parent_key)) {
354 parent_element = elements_[parent_key].get(); 335 parent_element = elements_[parent_key].get();
355 } 336 }
356 } 337 }
357 338
358 // If a parent element was found, add ourselves as a child, ensuring not to 339 // If a parent element was found, add ourselves as a child, ensuring not to
359 // duplicate child IDs. 340 // duplicate child IDs.
360 if (parent_element) { 341 if (parent_element) {
361 bool duplicate_child = false; 342 bool duplicate_child = false;
362 for (const int child_id : parent_element->child_ids()) { 343 for (const int child_id : parent_element->child_ids()) {
363 if (child_id == cur_element->id()) { 344 if (child_id == cur_element->id()) {
364 duplicate_child = true; 345 duplicate_child = true;
365 break; 346 break;
366 } 347 }
367 } 348 }
368 if (!duplicate_child) { 349 if (!duplicate_child) {
369 parent_element->add_child_ids(cur_element->id()); 350 parent_element->add_child_ids(cur_element->id());
370 } 351 }
371 } 352 }
372
373 // Finally, we need to check if the current element is the parent of some
374 // other elements that came in from another frame earlier. This only happens
375 // if we are an iframe, and our src URL exists in
376 // document_url_to_children_map_. If there is a match, then all of the
377 // children in that map belong to us.
378 if (is_frame && resource &&
379 base::ContainsKey(document_url_to_children_map_, resource->url())) {
380 const std::unordered_set<int>& child_ids =
381 document_url_to_children_map_[resource->url()];
382 for (const int child_id : child_ids) {
383 cur_element->add_child_ids(child_id);
384 }
385 }
386 } 353 }
387 354
388 void ThreatDetails::StartCollection() { 355 void ThreatDetails::StartCollection() {
389 DVLOG(1) << "Starting to compute threat details."; 356 DVLOG(1) << "Starting to compute threat details.";
390 report_.reset(new ClientSafeBrowsingReportRequest()); 357 report_.reset(new ClientSafeBrowsingReportRequest());
391 358
392 if (IsReportableUrl(resource_.url)) { 359 if (IsReportableUrl(resource_.url)) {
393 report_->set_url(resource_.url.spec()); 360 report_->set_url(resource_.url.spec());
394 report_->set_type(GetReportTypeFromSBThreatType(resource_.threat_type)); 361 report_->set_type(GetReportTypeFromSBThreatType(resource_.threat_type));
395 } 362 }
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
445 // detail collection could be started once the page loads. 412 // detail collection could be started once the page loads.
446 web_contents()->SendToAllFrames( 413 web_contents()->SendToAllFrames(
447 new SafeBrowsingMsg_GetThreatDOMDetails(MSG_ROUTING_NONE)); 414 new SafeBrowsingMsg_GetThreatDOMDetails(MSG_ROUTING_NONE));
448 } 415 }
449 } 416 }
450 417
451 // When the renderer is done, this is called. 418 // When the renderer is done, this is called.
452 void ThreatDetails::OnReceivedThreatDOMDetails( 419 void ThreatDetails::OnReceivedThreatDOMDetails(
453 content::RenderFrameHost* sender, 420 content::RenderFrameHost* sender,
454 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) { 421 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) {
422 // Lookup the FrameTreeNodeId of any child frames in the list of DOM nodes.
Charlie Reis 2017/05/10 22:17:49 nit: FrameTreeNode ID
lpz 2017/05/12 13:53:16 Done.
423 const int sender_process_id = sender->GetProcess()->GetID();
424 const int sender_frame_tree_node_id = sender->GetFrameTreeNodeId();
425 KeyToFrameTreeIdMap child_frame_tree_map;
426 for (const SafeBrowsingHostMsg_ThreatDOMDetails_Node& node : params) {
427 if (node.child_frame_routing_id == 0)
428 continue;
429
430 const std::string cur_element_key =
431 GetElementKey(sender_frame_tree_node_id, node.node_id);
432 RenderFrameHost* rfh =
433 content::RenderFrameHost::GetRenderFrameHostForRoutingId(
434 sender_process_id, node.child_frame_routing_id);
435 if (!rfh) {
436 ambiguous_dom_ = true;
437 } else {
438 child_frame_tree_map[cur_element_key] = rfh->GetFrameTreeNodeId();
439 }
440 }
441
455 // Schedule this in IO thread, so it doesn't conflict with future users 442 // Schedule this in IO thread, so it doesn't conflict with future users
456 // of our data structures (eg GetSerializedReport). 443 // of our data structures (eg GetSerializedReport).
457 BrowserThread::PostTask( 444 BrowserThread::PostTask(
458 BrowserThread::IO, FROM_HERE, 445 BrowserThread::IO, FROM_HERE,
459 base::BindOnce(&ThreatDetails::AddDOMDetails, this, 446 base::Bind(&ThreatDetails::AddDOMDetails, this,
460 sender->GetFrameTreeNodeId(), 447 sender->GetProcess()->GetID(), sender->GetFrameTreeNodeId(),
461 sender->GetLastCommittedURL(), params)); 448 sender->GetLastCommittedURL(), params, child_frame_tree_map));
462 } 449 }
463 450
464 void ThreatDetails::AddDOMDetails( 451 void ThreatDetails::AddDOMDetails(
452 const int process_id,
465 const int frame_tree_node_id, 453 const int frame_tree_node_id,
466 const GURL& frame_last_committed_url, 454 const GURL& frame_last_committed_url,
467 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) { 455 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params,
456 const KeyToFrameTreeIdMap& child_frame_tree_map) {
468 DCHECK_CURRENTLY_ON(BrowserThread::IO); 457 DCHECK_CURRENTLY_ON(BrowserThread::IO);
469 DVLOG(1) << "Nodes from the DOM: " << params.size(); 458 DVLOG(1) << "Nodes from the DOM: " << params.size();
470 459
471 // If we have already started getting redirects from history service, 460 // If we have already started getting redirects from history service,
472 // don't modify state, otherwise will invalidate the iterators. 461 // don't modify state, otherwise will invalidate the iterators.
473 if (redirects_collector_->HasStarted()) 462 if (redirects_collector_->HasStarted())
474 return; 463 return;
475 464
476 // If we have already started collecting data from the HTTP cache, don't 465 // If we have already started collecting data from the HTTP cache, don't
477 // modify our state. 466 // modify our state.
478 if (cache_collector_->HasStarted()) 467 if (cache_collector_->HasStarted())
479 return; 468 return;
480 469
481 // Exit early if there are no nodes to process. 470 // Exit early if there are no nodes to process.
482 if (params.empty()) 471 if (params.empty())
483 return; 472 return;
484 473
485 // Try to deduce the URL that the render frame was handling. First check if 474 // Copy FrameTreeNode IDs for the child frame into the combined mapping.
486 // the summary node from the renderer has a document URL. If not, try looking 475 iframe_key_to_frame_tree_id_map_.insert(child_frame_tree_map.begin(),
487 // at the last committed URL of the frame. 476 child_frame_tree_map.end());
488 GURL frame_url;
489 if (IsReportableUrl(params.back().url)) {
490 frame_url = params.back().url;
491 } else if (IsReportableUrl(frame_last_committed_url)) {
492 frame_url = frame_last_committed_url;
493 }
494
495 // If we can't figure out which URL the frame was rendering then we don't know
496 // where these elements belong in the hierarchy. The DOM will be ambiguous.
497 if (frame_url.is_empty()) {
498 ambiguous_dom_ = true;
499 }
500 477
501 // Add the urls from the DOM to |resources_|. The renderer could be sending 478 // Add the urls from the DOM to |resources_|. The renderer could be sending
502 // bogus messages, so limit the number of nodes we accept. 479 // bogus messages, so limit the number of nodes we accept.
503 // Also update |elements_| with the DOM structure. 480 // Also update |elements_| with the DOM structure.
504 for (size_t i = 0; i < params.size() && i < kMaxDomNodes; ++i) { 481 for (size_t i = 0; i < params.size() && i < kMaxDomNodes; ++i) {
505 SafeBrowsingHostMsg_ThreatDOMDetails_Node node = params[i]; 482 SafeBrowsingHostMsg_ThreatDOMDetails_Node node = params[i];
506 DVLOG(1) << node.url << ", " << node.tag_name << ", " << node.parent; 483 DVLOG(1) << node.url << ", " << node.tag_name << ", " << node.parent;
507 ClientSafeBrowsingReportRequest::Resource* resource = nullptr; 484 ClientSafeBrowsingReportRequest::Resource* resource = nullptr;
508 if (!node.url.is_empty()) { 485 if (!node.url.is_empty()) {
509 resource = AddUrl(node.url, node.parent, node.tag_name, &(node.children)); 486 resource = AddUrl(node.url, node.parent, node.tag_name, &(node.children));
510 } 487 }
511 // Check for a tag_name to avoid adding the summary node to the DOM. 488 // Check for a tag_name to avoid adding the summary node to the DOM.
512 if (!node.tag_name.empty()) { 489 if (!node.tag_name.empty()) {
513 AddDomElement(frame_tree_node_id, frame_url.spec(), node.node_id, 490 AddDomElement(process_id, frame_tree_node_id, node.child_frame_routing_id,
514 node.tag_name, node.parent_node_id, node.attributes, 491 node.node_id, node.tag_name, node.parent_node_id,
515 resource); 492 node.attributes, resource);
516 } 493 }
517 } 494 }
518 } 495 }
519 496
520 // Called from the SB Service on the IO thread, after the user has 497 // Called from the SB Service on the IO thread, after the user has
521 // closed the tab, or clicked proceed or goback. Since the user needs 498 // closed the tab, or clicked proceed or goback. Since the user needs
522 // to take an action, we expect this to be called after 499 // to take an action, we expect this to be called after
523 // OnReceivedThreatDOMDetails in most cases. If not, we don't include 500 // OnReceivedThreatDOMDetails in most cases. If not, we don't include
524 // the DOM data in our report. 501 // the DOM data in our report.
525 void ThreatDetails::FinishCollection(bool did_proceed, int num_visit) { 502 void ThreatDetails::FinishCollection(bool did_proceed, int num_visit) {
526 DCHECK_CURRENTLY_ON(BrowserThread::IO); 503 DCHECK_CURRENTLY_ON(BrowserThread::IO);
527 504
505 // Do a second pass over the elements and update iframe elements to have
Charlie Reis 2017/05/10 22:17:49 Side note: I don't fully understand this second pass…
lpz 2017/05/12 13:53:16 Yes, this is tested by ThreatDetailsTest.ThreatDOM
Charlie Reis 2017/05/12 21:40:50 Acknowledged.
506 // references to their children. Children may have been received from a
507 // different renderer than the iframe element.
508 for (auto& element_pair : elements_) {
509 const std::string& element_key = element_pair.first;
510 HTMLElement* element = element_pair.second.get();
511 if (base::ContainsKey(iframe_key_to_frame_tree_id_map_, element_key)) {
512 int frame_tree_id_of_iframe_renderer =
513 iframe_key_to_frame_tree_id_map_[element_key];
514 const std::unordered_set<int>& child_ids =
515 frame_tree_id_to_children_map_[frame_tree_id_of_iframe_renderer];
516 for (const int child_id : child_ids) {
517 element->add_child_ids(child_id);
518 }
519 }
520 }
528 did_proceed_ = did_proceed; 521 did_proceed_ = did_proceed;
529 num_visits_ = num_visit; 522 num_visits_ = num_visit;
530 std::vector<GURL> urls; 523 std::vector<GURL> urls;
531 for (ResourceMap::const_iterator it = resources_.begin(); 524 for (ResourceMap::const_iterator it = resources_.begin();
532 it != resources_.end(); ++it) { 525 it != resources_.end(); ++it) {
533 urls.push_back(GURL(it->first)); 526 urls.push_back(GURL(it->first));
534 } 527 }
535 redirects_collector_->StartHistoryCollection( 528 redirects_collector_->StartHistoryCollection(
536 urls, base::Bind(&ThreatDetails::OnRedirectionCollectionReady, this)); 529 urls, base::Bind(&ThreatDetails::OnRedirectionCollectionReady, this));
537 } 530 }
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
593 // Send the report, using the SafeBrowsingService. 586 // Send the report, using the SafeBrowsingService.
594 std::string serialized; 587 std::string serialized;
595 if (!report_->SerializeToString(&serialized)) { 588 if (!report_->SerializeToString(&serialized)) {
596 DLOG(ERROR) << "Unable to serialize the threat report."; 589 DLOG(ERROR) << "Unable to serialize the threat report.";
597 return; 590 return;
598 } 591 }
599 ui_manager_->SendSerializedThreatDetails(serialized); 592 ui_manager_->SendSerializedThreatDetails(serialized);
600 } 593 }
601 594
602 } // namespace safe_browsing 595 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698