Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(83)

Side by Side Diff: chrome/browser/metrics/thread_watcher.cc

Issue 7134007: Added command line switches "crash-on-hang-threads" and "crash-on-hang-seconds" (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 9 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "base/string_tokenizer.h"
5 #include "base/threading/thread_restrictions.h" 6 #include "base/threading/thread_restrictions.h"
6 #include "build/build_config.h" 7 #include "build/build_config.h"
7 #include "chrome/browser/metrics/metrics_service.h" 8 #include "chrome/browser/metrics/metrics_service.h"
8 #include "chrome/browser/metrics/thread_watcher.h" 9 #include "chrome/browser/metrics/thread_watcher.h"
10 #include "chrome/common/chrome_switches.h"
9 #include "content/common/notification_service.h" 11 #include "content/common/notification_service.h"
10 12
11 #if defined(OS_WIN) 13 #if defined(OS_WIN)
12 #include <Objbase.h> 14 #include <Objbase.h>
13 #endif 15 #endif
14 16
15 // static 17 // static
16 const int ThreadWatcher::kPingCount = 6; 18 const int ThreadWatcher::kPingCount = 6;
17 19
18 // static
19 const int ThreadWatcher::kUnresponsiveCount = 6;
20
21 // ThreadWatcher methods and members. 20 // ThreadWatcher methods and members.
22 ThreadWatcher::ThreadWatcher(const BrowserThread::ID& thread_id, 21 ThreadWatcher::ThreadWatcher(const BrowserThread::ID& thread_id,
23 const std::string& thread_name, 22 const std::string& thread_name,
24 const base::TimeDelta& sleep_time, 23 const base::TimeDelta& sleep_time,
25 const base::TimeDelta& unresponsive_time) 24 const base::TimeDelta& unresponsive_time)
26 : thread_id_(thread_id), 25 : thread_id_(thread_id),
27 thread_name_(thread_name), 26 thread_name_(thread_name),
28 sleep_time_(sleep_time), 27 sleep_time_(sleep_time),
29 unresponsive_time_(unresponsive_time), 28 unresponsive_time_(unresponsive_time),
30 ping_time_(base::TimeTicks::Now()), 29 ping_time_(base::TimeTicks::Now()),
(...skipping 206 matching lines...) Expand 10 before | Expand all | Expand 10 after
237 236
238 void ThreadWatcher::ResetHangCounters() { 237 void ThreadWatcher::ResetHangCounters() {
239 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread()); 238 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
240 unresponsive_count_ = 0; 239 unresponsive_count_ = 0;
241 hung_processing_complete_ = false; 240 hung_processing_complete_ = false;
242 } 241 }
243 242
244 void ThreadWatcher::GotNoResponse() { 243 void ThreadWatcher::GotNoResponse() {
245 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread()); 244 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
246 245
247 // Record how other threads are responding when we don't get a response for 246 ++unresponsive_count_;
248 // ping message at least kUnresponsiveCount times. 247
249 if (++unresponsive_count_ < kUnresponsiveCount) 248 // Check if the watched thread's unresponsiveness has gone over the limit.
249 if (ThreadWatcherList::IsResponsive(this))
jar (doing other things) 2011/06/10 00:38:33 This interface surprises me. We are asking a ques
ramant (doing other things) 2011/06/13 03:26:02 Done.
250 return; 250 return;
251 251
252 // Record total unresponsive_time since last pong message. 252 // Record total unresponsive_time since last pong message.
253 base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_; 253 base::TimeDelta unresponse_time = base::TimeTicks::Now() - pong_time_;
254 unresponsive_time_histogram_->AddTime(unresponse_time); 254 unresponsive_time_histogram_->AddTime(unresponse_time);
255 255
256 // We have already collected stats for the non-responding watched thread. 256 // We have already collected stats for the non-responding watched thread.
257 if (hung_processing_complete_) 257 if (hung_processing_complete_)
258 return; 258 return;
259 259
260 int no_of_responding_threads = 0; 260 int no_of_responding_threads = 0;
261 int no_of_unresponding_threads = 0; 261 int no_of_unresponding_threads = 0;
262 ThreadWatcherList::GetStatusOfThreads(&no_of_responding_threads, 262 ThreadWatcherList::GetStatusOfThreads(&no_of_responding_threads,
263 &no_of_unresponding_threads); 263 &no_of_unresponding_threads);
264 264
265 // Record how many watched threads are responding. 265 // Record how many watched threads are responding.
266 responsive_count_histogram_->Add(no_of_responding_threads); 266 responsive_count_histogram_->Add(no_of_responding_threads);
267 267
268 // Record how many watched threads are not responding. 268 // Record how many watched threads are not responding.
269 unresponsive_count_histogram_->Add(no_of_unresponding_threads); 269 unresponsive_count_histogram_->Add(no_of_unresponding_threads);
270 270
271 // Crash the browser if IO thread hasn't responded at least kUnresponsiveCount 271 // Crash the browser if watched thread is in "--crash-on-hang-threads" command
jar (doing other things) 2011/06/10 00:38:33 This sentence seems to focus on the command line s
ramant (doing other things) 2011/06/13 03:26:02 Done.
272 // times and if the number of other threads is equal to 1. We picked 1 to 272 // line switch. We crash if the number of threads responding is equal to 1. We
273 // reduce the number of crashes and to get some sample data. 273 // picked 1 to reduce the number of crashes and to get some sample data.
274 if (thread_id_ == BrowserThread::IO && no_of_responding_threads == 1) { 274 if (no_of_responding_threads == 1 && ThreadWatcherList::CrashOnHang(this)) {
jar (doing other things) 2011/06/10 00:38:33 I'm not sure which is better.... but I suspect we
ramant (doing other things) 2011/06/13 03:26:02 Done.
275 int* crash = NULL; 275 int* crash = NULL;
276 CHECK(crash++); 276 CHECK(crash+thread_id_);
277 } 277 }
278 278
279 hung_processing_complete_ = true; 279 hung_processing_complete_ = true;
280 } 280 }
281 281
282 // ThreadWatcherList methods and members. 282 // ThreadWatcherList methods and members.
283 // 283 //
284 // static 284 // static
285 ThreadWatcherList* ThreadWatcherList::global_ = NULL; 285 ThreadWatcherList* ThreadWatcherList::global_ = NULL;
286 // static 286 // static
287 const int ThreadWatcherList::kSleepSeconds = 1; 287 const int ThreadWatcherList::kSleepSeconds = 1;
288 // static 288 // static
289 const int ThreadWatcherList::kUnresponsiveSeconds = 2; 289 const int ThreadWatcherList::kUnresponsiveSeconds = 2;
290 // static
291 const int ThreadWatcherList::kUnresponsiveCount = 6;
290 292
291 ThreadWatcherList::ThreadWatcherList() 293 ThreadWatcherList::ThreadWatcherList(const CommandLine& command_line)
292 : last_wakeup_time_(base::TimeTicks::Now()) { 294 : last_wakeup_time_(base::TimeTicks::Now()) {
293 // Assert we are not running on WATCHDOG thread. Would be ideal to assert we 295 // Assert we are not running on WATCHDOG thread. Would be ideal to assert we
294 // are on UI thread, but Unit tests are not running on UI thread. 296 // are on UI thread, but Unit tests are not running on UI thread.
295 DCHECK(!WatchDogThread::CurrentlyOnWatchDogThread()); 297 DCHECK(!WatchDogThread::CurrentlyOnWatchDogThread());
296 CHECK(!global_); 298 CHECK(!global_);
297 global_ = this; 299 global_ = this;
298 // Register Notifications observer. 300 // Register Notifications observer.
299 MetricsService::SetUpNotifications(&registrar_, this); 301 MetricsService::SetUpNotifications(&registrar_, this);
302
303 crash_on_unresponsive_count_ = kUnresponsiveCount;
304 std::string crash_on_hang_seconds =
305 command_line.GetSwitchValueASCII(switches::kCrashOnHangSeconds);
306 if (!crash_on_hang_seconds.empty()) {
307 int crash_seconds = atoi(crash_on_hang_seconds.c_str());
308 if (crash_seconds > 0)
309 crash_on_unresponsive_count_ = crash_seconds / kUnresponsiveSeconds;
jar (doing other things) 2011/06/10 00:38:33 I don't think you meant to scale it down. If you
ramant (doing other things) 2011/06/13 03:26:02 Done.
310 }
311
312 std::string crash_on_hang_threads =
313 command_line.GetSwitchValueASCII(switches::kCrashOnHangThreads);
314 if (crash_on_hang_threads.empty()) {
315 // Crash the browser if UI or IO threads are not responsive.
316 crash_on_hang_threads = "UI,IO";
jar (doing other things) 2011/06/10 00:38:33 You probably could use an early return here. Perh
ramant (doing other things) 2011/06/13 03:26:02 Wanted to add UI and IO threads (which we wanted t
317 }
318 StringTokenizer t(crash_on_hang_threads, ",");
319 while (t.GetNext()) {
320 std::string thread_name = t.token();
321 // We will ignore empty and duplicate thread_names.
jar (doing other things) 2011/06/10 00:38:33 You probably don't need to worry about dups (which
ramant (doing other things) 2011/06/13 03:26:02 Done.
322 if (!thread_name.empty())
323 crash_on_hang_thread_names_.insert(thread_name);
324 }
300 } 325 }
301 326
302 ThreadWatcherList::~ThreadWatcherList() { 327 ThreadWatcherList::~ThreadWatcherList() {
303 base::AutoLock auto_lock(lock_); 328 base::AutoLock auto_lock(lock_);
304 DCHECK(this == global_); 329 DCHECK(this == global_);
330 global_->crash_on_hang_thread_names_.clear();
jar (doing other things) 2011/06/10 00:38:33 I don't think you need to waste time clear()ing.
ramant (doing other things) 2011/06/13 03:26:02 Done.
305 global_ = NULL; 331 global_ = NULL;
306 } 332 }
307 333
308 // static 334 // static
309 void ThreadWatcherList::Register(ThreadWatcher* watcher) { 335 void ThreadWatcherList::Register(ThreadWatcher* watcher) {
310 if (!global_) 336 if (!global_)
311 return; 337 return;
312 base::AutoLock auto_lock(global_->lock_); 338 base::AutoLock auto_lock(global_->lock_);
313 DCHECK(!global_->PreLockedFind(watcher->thread_id())); 339 DCHECK(!global_->PreLockedFind(watcher->thread_id()));
314 global_->registered_[watcher->thread_id()] = watcher; 340 global_->registered_[watcher->thread_id()] = watcher;
(...skipping 62 matching lines...) Expand 10 before | Expand all | Expand 10 after
377 // Assert we are not running on WATCHDOG thread. Would be ideal to assert we 403 // Assert we are not running on WATCHDOG thread. Would be ideal to assert we
378 // are on UI thread, but Unit tests are not running on UI thread. 404 // are on UI thread, but Unit tests are not running on UI thread.
379 DCHECK(!WatchDogThread::CurrentlyOnWatchDogThread()); 405 DCHECK(!WatchDogThread::CurrentlyOnWatchDogThread());
380 if (!global_) 406 if (!global_)
381 return; 407 return;
382 base::AutoLock auto_lock(global_->lock_); 408 base::AutoLock auto_lock(global_->lock_);
383 global_->registrar_.RemoveAll(); 409 global_->registrar_.RemoveAll();
384 } 410 }
385 411
386 // static 412 // static
413 bool ThreadWatcherList::IsResponsive(ThreadWatcher* watcher) {
jar (doing other things) 2011/06/10 00:38:33 I'd rather see just the unresponsive_count() passe
ramant (doing other things) 2011/06/13 03:26:02 Moved this method into ThreadWatcher. Done.
414 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
415 if (!global_)
416 return true;
417 if (watcher->unresponsive_count() < global_->crash_on_unresponsive_count_)
418 return true;
419 return false;
jar (doing other things) 2011/06/10 00:38:33 Except for unusual circumstances (where we plan to
ramant (doing other things) 2011/06/13 03:26:02 Moved this method into ThreadWatcher. Done.
420 }
421
422 // static
423 bool ThreadWatcherList::CrashOnHang(ThreadWatcher* watcher) {
jar (doing other things) 2011/06/10 00:38:33 I'd rather see this take a std::string thread_name
ramant (doing other things) 2011/06/13 03:26:02 Done.
424 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
425 if (!global_)
426 return false;
427 if (IsResponsive(watcher))
428 return false;
429 std::set<std::string>::iterator it =
430 global_->crash_on_hang_thread_names_.find(watcher->thread_name());
431 if (it != global_->crash_on_hang_thread_names_.end())
432 return true;
433 return false;
jar (doing other things) 2011/06/10 00:38:33 return it != global_->crash_on_hang_thread_names_.
ramant (doing other things) 2011/06/13 03:26:02 Done.
434 }
435
436 // static
387 void ThreadWatcherList::GetStatusOfThreads(int* no_of_responding_threads, 437 void ThreadWatcherList::GetStatusOfThreads(int* no_of_responding_threads,
388 int* no_of_unresponding_threads) { 438 int* no_of_unresponding_threads) {
389 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread()); 439 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
390 *no_of_responding_threads = 0; 440 *no_of_responding_threads = 0;
391 *no_of_unresponding_threads = 0; 441 *no_of_unresponding_threads = 0;
392 if (!global_) 442 if (!global_)
393 return; 443 return;
394 444
395 base::AutoLock auto_lock(global_->lock_); 445 base::AutoLock auto_lock(global_->lock_);
396 for (RegistrationList::iterator it = global_->registered_.begin(); 446 for (RegistrationList::iterator it = global_->registered_.begin();
397 global_->registered_.end() != it; 447 global_->registered_.end() != it;
398 ++it) { 448 ++it) {
399 if (it->second->unresponsive_count_ < ThreadWatcher::kUnresponsiveCount) 449 if (it->second->unresponsive_count() <
450 global_->crash_on_unresponsive_count_)
jar (doing other things) 2011/06/10 00:38:33 FWIW: IF you restructured this (pushing the thresh
ramant (doing other things) 2011/06/13 03:26:02 Done.
400 ++(*no_of_responding_threads); 451 ++(*no_of_responding_threads);
401 else 452 else
402 ++(*no_of_unresponding_threads); 453 ++(*no_of_unresponding_threads);
403 } 454 }
404 } 455 }
405 456
406 void ThreadWatcherList::DeleteAll() { 457 void ThreadWatcherList::DeleteAll() {
407 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread()); 458 DCHECK(WatchDogThread::CurrentlyOnWatchDogThread());
408 base::AutoLock auto_lock(lock_); 459 base::AutoLock auto_lock(lock_);
409 while (!registered_.empty()) { 460 while (!registered_.empty()) {
(...skipping 131 matching lines...) Expand 10 before | Expand all | Expand 10 after
541 watchdog_thread_ = NULL; 592 watchdog_thread_ = NULL;
542 } 593 }
543 594
544 void WatchDogThread::CleanUpAfterMessageLoopDestruction() { 595 void WatchDogThread::CleanUpAfterMessageLoopDestruction() {
545 #if defined(OS_WIN) 596 #if defined(OS_WIN)
546 // Closes the COM library on the current thread. CoInitialize must 597 // Closes the COM library on the current thread. CoInitialize must
547 // be balanced by a corresponding call to CoUninitialize. 598 // be balanced by a corresponding call to CoUninitialize.
548 CoUninitialize(); 599 CoUninitialize();
549 #endif 600 #endif
550 } 601 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698