Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(333)

Side by Side Diff: chrome/browser/safe_browsing/threat_details.cc

Issue 2837603002: Content API changes to improve DOM stitching in ThreatDetails code. (Closed)
Patch Set: Remove unnecessary deps Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // Implementation of the ThreatDetails class. 5 // Implementation of the ThreatDetails class.
6 6
7 #include "chrome/browser/safe_browsing/threat_details.h" 7 #include "chrome/browser/safe_browsing/threat_details.h"
8 8
9 #include <stddef.h> 9 #include <stddef.h>
10 #include <stdint.h> 10 #include <stdint.h>
11 11
12 #include "base/bind.h" 12 #include "base/bind.h"
13 #include "base/lazy_instance.h" 13 #include "base/lazy_instance.h"
14 #include "base/metrics/histogram_macros.h" 14 #include "base/metrics/histogram_macros.h"
15 #include "base/strings/string_util.h" 15 #include "base/strings/string_util.h"
16 #include "chrome/browser/safe_browsing/threat_details_cache.h" 16 #include "chrome/browser/safe_browsing/threat_details_cache.h"
17 #include "chrome/browser/safe_browsing/threat_details_history.h" 17 #include "chrome/browser/safe_browsing/threat_details_history.h"
18 #include "components/history/core/browser/history_service.h" 18 #include "components/history/core/browser/history_service.h"
19 #include "components/safe_browsing/base_ui_manager.h" 19 #include "components/safe_browsing/base_ui_manager.h"
20 #include "components/safe_browsing/common/safebrowsing_messages.h" 20 #include "components/safe_browsing/common/safebrowsing_messages.h"
21 #include "content/public/browser/browser_thread.h" 21 #include "content/public/browser/browser_thread.h"
22 #include "content/public/browser/navigation_controller.h" 22 #include "content/public/browser/navigation_controller.h"
23 #include "content/public/browser/navigation_entry.h" 23 #include "content/public/browser/navigation_entry.h"
24 #include "content/public/browser/render_frame_host.h" 24 #include "content/public/browser/render_frame_host.h"
25 #include "content/public/browser/render_process_host.h"
25 #include "content/public/browser/web_contents.h" 26 #include "content/public/browser/web_contents.h"
26 #include "net/url_request/url_request_context_getter.h" 27 #include "net/url_request/url_request_context_getter.h"
27 28
28 using content::BrowserThread; 29 using content::BrowserThread;
29 using content::NavigationEntry; 30 using content::NavigationEntry;
30 using content::RenderFrameHost; 31 using content::RenderFrameHost;
31 using content::WebContents; 32 using content::WebContents;
32 33
33 // Keep in sync with KMaxNodes in components/safe_browsing/renderer/ 34 // Keep in sync with KMaxNodes in components/safe_browsing/renderer/
34 // threat_dom_details.cc 35 // threat_dom_details.cc
(...skipping 143 matching lines...) Expand 10 before | Expand all | Expand 10 after
178 const UnsafeResource& resource, 179 const UnsafeResource& resource,
179 net::URLRequestContextGetter* request_context_getter, 180 net::URLRequestContextGetter* request_context_getter,
180 history::HistoryService* history_service) 181 history::HistoryService* history_service)
181 : content::WebContentsObserver(web_contents), 182 : content::WebContentsObserver(web_contents),
182 request_context_getter_(request_context_getter), 183 request_context_getter_(request_context_getter),
183 ui_manager_(ui_manager), 184 ui_manager_(ui_manager),
184 resource_(resource), 185 resource_(resource),
185 cache_result_(false), 186 cache_result_(false),
186 did_proceed_(false), 187 did_proceed_(false),
187 num_visits_(0), 188 num_visits_(0),
188 ambiguous_dom_(false),
189 cache_collector_(new ThreatDetailsCacheCollector) { 189 cache_collector_(new ThreatDetailsCacheCollector) {
190 redirects_collector_ = new ThreatDetailsRedirectsCollector( 190 redirects_collector_ = new ThreatDetailsRedirectsCollector(
191 history_service ? history_service->AsWeakPtr() 191 history_service ? history_service->AsWeakPtr()
192 : base::WeakPtr<history::HistoryService>()); 192 : base::WeakPtr<history::HistoryService>());
193 StartCollection(); 193 StartCollection();
194 } 194 }
195 195
196 ThreatDetails::~ThreatDetails() {} 196 ThreatDetails::~ThreatDetails() {}
197 197
198 bool ThreatDetails::OnMessageReceived(const IPC::Message& message, 198 bool ThreatDetails::OnMessageReceived(const IPC::Message& message,
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
279 } 279 }
280 if (!duplicate_child) 280 if (!duplicate_child)
281 url_resource->add_child_ids(child_resource->id()); 281 url_resource->add_child_ids(child_resource->id());
282 } 282 }
283 } 283 }
284 284
285 return url_resource; 285 return url_resource;
286 } 286 }
287 287
288 void ThreatDetails::AddDomElement( 288 void ThreatDetails::AddDomElement(
289 const int process_id,
289 const int frame_tree_node_id, 290 const int frame_tree_node_id,
290 const std::string& frame_url, 291 const int other_frame_routing_id,
291 const int element_node_id, 292 const int element_node_id,
292 const std::string& tagname, 293 const std::string& tagname,
293 const int parent_element_node_id, 294 const int parent_element_node_id,
294 const std::vector<AttributeNameValue>& attributes, 295 const std::vector<AttributeNameValue>& attributes,
295 const ClientSafeBrowsingReportRequest::Resource* resource) { 296 const ClientSafeBrowsingReportRequest::Resource* resource) {
296 // Create the element. It should not exist already since this function should 297 // Create the element. It should not exist already since this function should
297 // only be called once for each element. 298 // only be called once for each element.
298 const std::string element_key = 299 const std::string element_key =
299 GetElementKey(frame_tree_node_id, element_node_id); 300 GetElementKey(frame_tree_node_id, element_node_id);
300 HTMLElement* cur_element = FindOrCreateElement(element_key); 301 HTMLElement* cur_element = FindOrCreateElement(element_key);
301 302
302 // Set some basic metadata about the element. 303 // Set some basic metadata about the element.
303 const std::string tag_name_upper = base::ToUpperASCII(tagname); 304 const std::string tag_name_upper = base::ToUpperASCII(tagname);
304 if (!tag_name_upper.empty()) { 305 if (!tag_name_upper.empty()) {
305 cur_element->set_tag(tag_name_upper); 306 cur_element->set_tag(tag_name_upper);
306 } 307 }
307 for (const AttributeNameValue& attribute : attributes) { 308 for (const AttributeNameValue& attribute : attributes) {
308 HTMLElement::Attribute* attribute_pb = cur_element->add_attribute(); 309 HTMLElement::Attribute* attribute_pb = cur_element->add_attribute();
309 attribute_pb->set_name(attribute.first); 310 attribute_pb->set_name(attribute.first);
310 attribute_pb->set_value(attribute.second); 311 attribute_pb->set_value(attribute.second);
311 } 312 }
312 bool is_frame = tag_name_upper == "IFRAME" || tag_name_upper == "FRAME"; 313 bool is_frame = tag_name_upper == "IFRAME" || tag_name_upper == "FRAME";
313 314
314 if (resource) { 315 if (resource) {
315 cur_element->set_resource_id(resource->id()); 316 cur_element->set_resource_id(resource->id());
316 317
317 // For iframes, remember that this HTML Element represents an iframe with a 318 // For iframes, lookup the frame tree node id of the render frame that
318 // specific URL. Elements from a frame with this URL are children of this 319 // handled that iframe's content. This must be done on the UI thread, and
319 // element. 320 // will update a map of |element_key| to |frame_tree_node_id|. A second pass
320 if (is_frame && 321 // is done to update the |elements_| list using this mapping.
321 !base::ContainsKey(iframe_src_to_element_map_, resource->url())) { 322 if (is_frame) {
322 iframe_src_to_element_map_[resource->url()] = cur_element; 323 BrowserThread::PostTask(
324 BrowserThread::UI, FROM_HERE,
325 base::Bind(&ThreatDetails::LookupOtherFrameId, this, element_key,
326 process_id, other_frame_routing_id));
323 } 327 }
324 } 328 }
325 329
326 // Next we try to lookup the parent of the current element and add ourselves 330 // Next we try to lookup the parent of the current element and add ourselves
327 // as a child of it. 331 // as a child of it.
328 HTMLElement* parent_element = nullptr; 332 HTMLElement* parent_element = nullptr;
329 if (parent_element_node_id == 0) { 333 if (parent_element_node_id == 0) {
330 // No parent indicates that this element is at the top of the current frame. 334 // No parent indicates that this element is at the top of the current frame.
331 // This frame could be a child of an iframe in another frame, or it could be 335 // Remember that this is a top-level element of the frame with the
332 // at the root of the whole page. If we have a frame URL then we can try to 336 // current |frame_tree_node_id|. If this element is inside an iframe, a
333 // map this element to its parent. 337 // second pass will insert this element as a child of its parent iframe.
334 if (!frame_url.empty()) { 338 frame_tree_id_to_children_map_[frame_tree_node_id].insert(
335 // First, remember that this element is at the top-level of a frame with 339 cur_element->id());
336 // our frame URL.
337 document_url_to_children_map_[frame_url].insert(cur_element->id());
338
339 // Now check if the frame URL matches the src URL of an iframe elsewhere.
340 // This means that we processed the parent iframe element earlier, so we
341 // can add ourselves as a child of that iframe.
342 // If no such iframe exists, it could be processed later, or this element
343 // is in the top-level frame and truly has no parent.
344 if (base::ContainsKey(iframe_src_to_element_map_, frame_url)) {
345 parent_element = iframe_src_to_element_map_[frame_url];
346 }
347 }
348 } else { 340 } else {
349 // We have a parent ID, so this element is just a child of something inside 341 // We have a parent ID, so this element is just a child of something inside
350 // of our current frame. We can easily lookup our parent. 342 // of our current frame. We can easily lookup our parent.
351 const std::string& parent_key = 343 const std::string& parent_key =
352 GetElementKey(frame_tree_node_id, parent_element_node_id); 344 GetElementKey(frame_tree_node_id, parent_element_node_id);
353 if (base::ContainsKey(elements_, parent_key)) { 345 if (base::ContainsKey(elements_, parent_key)) {
354 parent_element = elements_[parent_key].get(); 346 parent_element = elements_[parent_key].get();
355 } 347 }
356 } 348 }
357 349
358 // If a parent element was found, add ourselves as a child, ensuring not to 350 // If a parent element was found, add ourselves as a child, ensuring not to
359 // duplicate child IDs. 351 // duplicate child IDs.
360 if (parent_element) { 352 if (parent_element) {
361 bool duplicate_child = false; 353 bool duplicate_child = false;
362 for (const int child_id : parent_element->child_ids()) { 354 for (const int child_id : parent_element->child_ids()) {
363 if (child_id == cur_element->id()) { 355 if (child_id == cur_element->id()) {
364 duplicate_child = true; 356 duplicate_child = true;
365 break; 357 break;
366 } 358 }
367 } 359 }
368 if (!duplicate_child) { 360 if (!duplicate_child) {
369 parent_element->add_child_ids(cur_element->id()); 361 parent_element->add_child_ids(cur_element->id());
370 } 362 }
371 } 363 }
372
373 // Finally, we need to check if the current element is the parent of some
374 // other elements that came in from another frame earlier. This only happens
375 // if we are an iframe, and our src URL exists in
376 // document_url_to_children_map_. If there is a match, then all of the
377 // children in that map belong to us.
378 if (is_frame && resource &&
379 base::ContainsKey(document_url_to_children_map_, resource->url())) {
380 const std::unordered_set<int>& child_ids =
381 document_url_to_children_map_[resource->url()];
382 for (const int child_id : child_ids) {
383 cur_element->add_child_ids(child_id);
384 }
385 }
386 } 364 }
387 365
388 void ThreatDetails::StartCollection() { 366 void ThreatDetails::StartCollection() {
389 DVLOG(1) << "Starting to compute threat details."; 367 DVLOG(1) << "Starting to compute threat details.";
390 report_.reset(new ClientSafeBrowsingReportRequest()); 368 report_.reset(new ClientSafeBrowsingReportRequest());
391 369
392 if (IsReportableUrl(resource_.url)) { 370 if (IsReportableUrl(resource_.url)) {
393 report_->set_url(resource_.url.spec()); 371 report_->set_url(resource_.url.spec());
394 report_->set_type(GetReportTypeFromSBThreatType(resource_.threat_type)); 372 report_->set_type(GetReportTypeFromSBThreatType(resource_.threat_type));
395 } 373 }
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
449 } 427 }
450 428
451 // When the renderer is done, this is called. 429 // When the renderer is done, this is called.
452 void ThreatDetails::OnReceivedThreatDOMDetails( 430 void ThreatDetails::OnReceivedThreatDOMDetails(
453 content::RenderFrameHost* sender, 431 content::RenderFrameHost* sender,
454 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) { 432 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) {
455 // Schedule this in IO thread, so it doesn't conflict with future users 433 // Schedule this in IO thread, so it doesn't conflict with future users
456 // of our data structures (eg GetSerializedReport). 434 // of our data structures (eg GetSerializedReport).
457 BrowserThread::PostTask( 435 BrowserThread::PostTask(
458 BrowserThread::IO, FROM_HERE, 436 BrowserThread::IO, FROM_HERE,
459 base::BindOnce(&ThreatDetails::AddDOMDetails, this, 437 base::Bind(&ThreatDetails::AddDOMDetails, this,
460 sender->GetFrameTreeNodeId(), 438 sender->GetProcess()->GetID(), sender->GetFrameTreeNodeId(),
461 sender->GetLastCommittedURL(), params)); 439 sender->GetLastCommittedURL(), params));
462 } 440 }
463 441
464 void ThreatDetails::AddDOMDetails( 442 void ThreatDetails::AddDOMDetails(
443 const int process_id,
465 const int frame_tree_node_id, 444 const int frame_tree_node_id,
466 const GURL& frame_last_committed_url, 445 const GURL& frame_last_committed_url,
467 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) { 446 const std::vector<SafeBrowsingHostMsg_ThreatDOMDetails_Node>& params) {
468 DCHECK_CURRENTLY_ON(BrowserThread::IO); 447 DCHECK_CURRENTLY_ON(BrowserThread::IO);
469 DVLOG(1) << "Nodes from the DOM: " << params.size(); 448 DVLOG(1) << "Nodes from the DOM: " << params.size();
470 449
471 // If we have already started getting redirects from history service, 450 // If we have already started getting redirects from history service,
472 // don't modify state, otherwise will invalidate the iterators. 451 // don't modify state, otherwise will invalidate the iterators.
473 if (redirects_collector_->HasStarted()) 452 if (redirects_collector_->HasStarted())
474 return; 453 return;
475 454
476 // If we have already started collecting data from the HTTP cache, don't 455 // If we have already started collecting data from the HTTP cache, don't
477 // modify our state. 456 // modify our state.
478 if (cache_collector_->HasStarted()) 457 if (cache_collector_->HasStarted())
479 return; 458 return;
480 459
481 // Exit early if there are no nodes to process. 460 // Exit early if there are no nodes to process.
482 if (params.empty()) 461 if (params.empty())
483 return; 462 return;
484 463
485 // Try to deduce the URL that the render frame was handling. First check if
486 // the summary node from the renderer has a document URL. If not, try looking
487 // at the last committed URL of the frame.
488 GURL frame_url;
489 if (IsReportableUrl(params.back().url)) {
490 frame_url = params.back().url;
491 } else if (IsReportableUrl(frame_last_committed_url)) {
492 frame_url = frame_last_committed_url;
493 }
494
495 // If we can't figure out which URL the frame was rendering then we don't know
496 // where these elements belong in the hierarchy. The DOM will be ambiguous.
497 if (frame_url.is_empty()) {
498 ambiguous_dom_ = true;
499 }
500
501 // Add the urls from the DOM to |resources_|. The renderer could be sending 464 // Add the urls from the DOM to |resources_|. The renderer could be sending
502 // bogus messages, so limit the number of nodes we accept. 465 // bogus messages, so limit the number of nodes we accept.
503 // Also update |elements_| with the DOM structure. 466 // Also update |elements_| with the DOM structure.
504 for (size_t i = 0; i < params.size() && i < kMaxDomNodes; ++i) { 467 for (size_t i = 0; i < params.size() && i < kMaxDomNodes; ++i) {
505 SafeBrowsingHostMsg_ThreatDOMDetails_Node node = params[i]; 468 SafeBrowsingHostMsg_ThreatDOMDetails_Node node = params[i];
506 DVLOG(1) << node.url << ", " << node.tag_name << ", " << node.parent; 469 DVLOG(1) << node.url << ", " << node.tag_name << ", " << node.parent;
507 ClientSafeBrowsingReportRequest::Resource* resource = nullptr; 470 ClientSafeBrowsingReportRequest::Resource* resource = nullptr;
508 if (!node.url.is_empty()) { 471 if (!node.url.is_empty()) {
509 resource = AddUrl(node.url, node.parent, node.tag_name, &(node.children)); 472 resource = AddUrl(node.url, node.parent, node.tag_name, &(node.children));
510 } 473 }
511 // Check for a tag_name to avoid adding the summary node to the DOM. 474 // Check for a tag_name to avoid adding the summary node to the DOM.
512 if (!node.tag_name.empty()) { 475 if (!node.tag_name.empty()) {
513 AddDomElement(frame_tree_node_id, frame_url.spec(), node.node_id, 476 AddDomElement(process_id, frame_tree_node_id, node.other_frame_routing_id,
514 node.tag_name, node.parent_node_id, node.attributes, 477 node.node_id, node.tag_name, node.parent_node_id,
515 resource); 478 node.attributes, resource);
516 } 479 }
517 } 480 }
518 } 481 }
519 482
483 void ThreatDetails::LookupOtherFrameId(const std::string& element_key,
484 const int process_id,
485 const int other_frame_routing_id) {
486 DCHECK_CURRENTLY_ON(BrowserThread::UI);
487 int other_frame_tree_node_id =
488 content::RenderFrameHost::LookupOtherFrameTreeNodeId(
489 process_id, other_frame_routing_id);
490 iframe_key_to_frame_tree_id_map_[element_key] = other_frame_tree_node_id;
491 }
492
520 // Called from the SB Service on the IO thread, after the user has 493 // Called from the SB Service on the IO thread, after the user has
521 // closed the tab, or clicked proceed or goback. Since the user needs 494 // closed the tab, or clicked proceed or goback. Since the user needs
522 // to take an action, we expect this to be called after 495 // to take an action, we expect this to be called after
523 // OnReceivedThreatDOMDetails in most cases. If not, we don't include 496 // OnReceivedThreatDOMDetails in most cases. If not, we don't include
524 // the DOM data in our report. 497 // the DOM data in our report.
525 void ThreatDetails::FinishCollection(bool did_proceed, int num_visit) { 498 void ThreatDetails::FinishCollection(bool did_proceed, int num_visit) {
526 DCHECK_CURRENTLY_ON(BrowserThread::IO); 499 DCHECK_CURRENTLY_ON(BrowserThread::IO);
527 500
501 // Do a second pass over the elements and update iframe elements to have
502 // references to their children. Children will have been received from a
503 // different renderer than the iframe element.
504 for (auto& element_pair : elements_) {
505 const std::string& element_key = element_pair.first;
506 HTMLElement* element = element_pair.second.get();
507 if (element->tag() == "IFRAME" || element->tag() == "FRAME") {
508 int frame_tree_id_of_iframe_renderer =
509 iframe_key_to_frame_tree_id_map_[element_key];
510 const std::unordered_set<int>& child_ids =
511 frame_tree_id_to_children_map_[frame_tree_id_of_iframe_renderer];
512 for (const int child_id : child_ids) {
513 element->add_child_ids(child_id);
514 }
515 }
516 }
528 did_proceed_ = did_proceed; 517 did_proceed_ = did_proceed;
529 num_visits_ = num_visit; 518 num_visits_ = num_visit;
530 std::vector<GURL> urls; 519 std::vector<GURL> urls;
531 for (ResourceMap::const_iterator it = resources_.begin(); 520 for (ResourceMap::const_iterator it = resources_.begin();
532 it != resources_.end(); ++it) { 521 it != resources_.end(); ++it) {
533 urls.push_back(GURL(it->first)); 522 urls.push_back(GURL(it->first));
534 } 523 }
535 redirects_collector_->StartHistoryCollection( 524 redirects_collector_->StartHistoryCollection(
536 urls, base::Bind(&ThreatDetails::OnRedirectionCollectionReady, this)); 525 urls, base::Bind(&ThreatDetails::OnRedirectionCollectionReady, this));
537 } 526 }
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
569 // Sanitize the HTTPS resource by clearing out private data (like cookie 558 // Sanitize the HTTPS resource by clearing out private data (like cookie
570 // headers). 559 // headers).
571 DVLOG(1) << "Clearing out HTTPS resource: " << pb_resource->url(); 560 DVLOG(1) << "Clearing out HTTPS resource: " << pb_resource->url();
572 ClearHttpsResource(pb_resource); 561 ClearHttpsResource(pb_resource);
573 // Keep id, parent_id, child_ids, and tag_name. 562 // Keep id, parent_id, child_ids, and tag_name.
574 } 563 }
575 } 564 }
576 for (auto& element_pair : elements_) { 565 for (auto& element_pair : elements_) {
577 report_->add_dom()->Swap(element_pair.second.get()); 566 report_->add_dom()->Swap(element_pair.second.get());
578 } 567 }
579 if (!elements_.empty()) {
580 // TODO(lpz): Consider including the ambiguous_dom_ bit in the report
581 // itself.
582 UMA_HISTOGRAM_BOOLEAN("SafeBrowsing.ThreatReport.DomIsAmbiguous",
583 ambiguous_dom_);
584 }
585 568
586 report_->set_did_proceed(did_proceed_); 569 report_->set_did_proceed(did_proceed_);
587 // Only sets repeat_visit if num_visits_ >= 0. 570 // Only sets repeat_visit if num_visits_ >= 0.
588 if (num_visits_ >= 0) { 571 if (num_visits_ >= 0) {
589 report_->set_repeat_visit(num_visits_ > 0); 572 report_->set_repeat_visit(num_visits_ > 0);
590 } 573 }
591 report_->set_complete(cache_result_); 574 report_->set_complete(cache_result_);
592 575
593 // Send the report, using the SafeBrowsingService. 576 // Send the report, using the SafeBrowsingService.
594 std::string serialized; 577 std::string serialized;
595 if (!report_->SerializeToString(&serialized)) { 578 if (!report_->SerializeToString(&serialized)) {
596 DLOG(ERROR) << "Unable to serialize the threat report."; 579 DLOG(ERROR) << "Unable to serialize the threat report.";
597 return; 580 return;
598 } 581 }
599 ui_manager_->SendSerializedThreatDetails(serialized); 582 ui_manager_->SendSerializedThreatDetails(serialized);
600 } 583 }
601 584
602 } // namespace safe_browsing 585 } // namespace safe_browsing
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698