Index: chrome/common/process_watcher_mac.cc |
=================================================================== |
--- chrome/common/process_watcher_mac.cc (revision 46468) |
+++ chrome/common/process_watcher_mac.cc (working copy) |
@@ -16,105 +16,155 @@ |
namespace { |
-// Reap |child| process. |
-// This call blocks until completion. |
+const int kWaitBeforeKillSeconds = 2; |
+ |
+// Reap |child| process. This call blocks until completion. |
void BlockingReap(pid_t child) { |
const pid_t result = HANDLE_EINTR(waitpid(child, NULL, 0)); |
if (result == -1) { |
- PLOG(ERROR) << "waitpid(" << child << ")"; |
- NOTREACHED(); |
+ PLOG(ERROR) << "waitpid(" << child << ", NULL, 0)"; |
} |
} |
-// Waits for |timeout| seconds for the given |child| to exit and reap it. |
-// If the child doesn't exit within a couple of seconds, kill it. |
-void WaitForChildToDie(pid_t child, unsigned timeout) { |
+// Waits for |timeout| seconds for the given |child| to exit and reap it. If |
+// the child doesn't exit within the time specified, kills it. |
+// |
+// This function takes two approaches: first, it tries to use kqueue to |
+// observe when the process exits. kevent can monitor a kqueue with a |
+// timeout, so this method is preferred to wait for a specified period of |
+// time. Once the kqueue indicates the process has exited, waitpid will reap |
+// the exited child. If the kqueue doesn't provide an exit event notification |
+// before the timeout expires, or if the kqueue fails or misbehaves, the |
+// process will be mercilessly killed and reaped. |
+// |
+// A child process passed to this function may be in one of several states: |
+// running, terminated and not yet reaped, and (apparently, and unfortunately) |
+// terminated and already reaped. Normally, a process will at least have been |
+// asked to exit before this function is called, but this is not required. |
+// If a process is terminating and unreaped, there may be a window between the |
+// time that kqueue will no longer recognize it and when it becomes an actual |
+// zombie that a non-blocking (WNOHANG) waitpid can reap. This condition is |
+// detected when kqueue indicates that the process is not running and a |
+// non-blocking waitpid fails to reap the process but indicates that it is |
+// still running. In this event, a blocking attempt to reap the process |
+// collects the known-dying child, preventing zombies from congregating. |
+// |
+// In the event that the kqueue misbehaves entirely, as it might under an |
+// EMFILE condition ("too many open files", or out of file descriptors), this |
+// function will forcibly kill and reap the child without delay. This |
+// eliminates another potential zombie vector. (If you're out of file |
+// descriptors, you're probably deep into something else, but that doesn't |
+// mean that zombies should be allowed to kick you while you're down.) |
+// |
+// The fact that this function seemingly can be called to wait on a child |
+// that's not only already terminated but already reaped is a bit of a |
+// problem: a reaped child's pid can be reclaimed and may refer to a distinct |
+// process in that case. The fact that this function can seemingly be called |
+// to wait on a process that's not even a child is also a problem: kqueue will |
+// work in that case, but waitpid won't, and killing a non-child might not be |
+// the best approach. |
+void WaitForChildToDie(pid_t child, int timeout) { |
+ DCHECK(child > 0); |
+ DCHECK(timeout > 0); |
+ |
+ // DON'T ADD ANY EARLY RETURNS TO THIS FUNCTION without ensuring that |
+ // |child| has been reaped. Specifically, even if a kqueue, kevent, or other |
+ // call fails, this function should fall back to the last resort of trying |
+ // to kill and reap the process. Not observing this rule will resurrect |
+ // zombies. |
+ |
+ int result; |
+ |
int kq = HANDLE_EINTR(kqueue()); |
- file_util::ScopedFD auto_close(&kq); |
if (kq == -1) { |
- PLOG(ERROR) << "Failed to create kqueue"; |
- return; |
- } |
+ PLOG(ERROR) << "kqueue()"; |
+ } else { |
+ file_util::ScopedFD auto_close_kq(&kq); |
- struct kevent event_to_add = {0}; |
- EV_SET(&event_to_add, child, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL); |
- // Register interest with kqueue. |
- int result = HANDLE_EINTR(kevent(kq, &event_to_add, 1, NULL, 0, NULL)); |
- if (result == -1 && errno == ESRCH) { |
- // A "No Such Process" error is fine, the process may have died already |
- // and been reaped by someone else. But make sure that it was/is reaped. |
- // Don't report an error in case it was already reaped. |
- HANDLE_EINTR(waitpid(child, NULL, WNOHANG)); |
- return; |
- } |
+ struct kevent change = {0}; |
+ EV_SET(&change, child, EVFILT_PROC, EV_ADD, NOTE_EXIT, 0, NULL); |
+ result = HANDLE_EINTR(kevent(kq, &change, 1, NULL, 0, NULL)); |
- if (result == -1) { |
- PLOG(ERROR) << "Failed to register event to listen for death of pid " |
- << child; |
- return; |
- } |
+ if (result == -1) { |
+ if (errno != ESRCH) { |
+ PLOG(ERROR) << "kevent (setup " << child << ")"; |
+ } else { |
+ // At this point, one of the following has occurred: |
+ // 1. The process has died but has not yet been reaped. |
+ // 2. The process has died and has already been reaped. |
+ // 3. The process is in the process of dying. It's no longer |
+ // kqueueable, but it may not be waitable yet either. Mark calls |
+ // this case the "zombie death race". |
- struct kevent event = {0}; |
+ result = HANDLE_EINTR(waitpid(child, NULL, WNOHANG)); |
- DCHECK(timeout != 0); |
+ if (result != 0) { |
+ // A positive result indicates case 1. waitpid succeeded and reaped |
+ // the child. A result of -1 indicates case 2. The child has already |
+ // been reaped. In both of these cases, no further action is |
+ // necessary. |
+ return; |
+ } |
- int num_processes_that_died = -1; |
- using base::Time; |
- using base::TimeDelta; |
- // We need to keep track of the elapsed time - if kevent() returns |
- // EINTR in the middle of blocking call we want to make up what's left |
- // of the timeout. |
- TimeDelta time_left = TimeDelta::FromSeconds(timeout); |
- Time wakeup = Time::Now() + time_left; |
- while(time_left.InMilliseconds() > 0) { |
- const struct timespec timeout = time_left.ToTimeSpec(); |
- num_processes_that_died = kevent(kq, NULL, 0, &event, 1, &timeout); |
- if (num_processes_that_died >= 0) |
- break; |
- if (num_processes_that_died == -1 && errno == EINTR) { |
- time_left = wakeup - Time::Now(); |
- continue; |
- } |
+ // |result| is 0, indicating case 3. The process will be waitable in |
+ // short order. Fall back out of the kqueue code to kill it (for good |
+ // measure) and reap it. |
+ } |
+ } else { |
+ // Keep track of the elapsed time to be able to restart kevent if it's |
+ // interrupted. |
+ base::TimeDelta remaining_delta = base::TimeDelta::FromSeconds(timeout); |
+ base::Time deadline = base::Time::Now() + remaining_delta; |
+ result = -1; |
+ struct kevent event = {0}; |
+ while (remaining_delta.InMilliseconds() > 0) { |
+ const struct timespec remaining_timespec = remaining_delta.ToTimeSpec(); |
+ result = kevent(kq, NULL, 0, &event, 1, &remaining_timespec); |
+ if (result == -1 && errno == EINTR) { |
+ remaining_delta = deadline - base::Time::Now(); |
+ result = 0; |
+ } else { |
+ break; |
+ } |
+ } |
- // If we got here, kevent() must have returned -1. |
- PLOG(ERROR) << "kevent() failed"; |
- break; |
- } |
- |
- if (num_processes_that_died == -1) { |
- PLOG(ERROR) << "kevent failed"; |
- return; |
- } |
- if (num_processes_that_died == 1) { |
- if (event.fflags & NOTE_EXIT && |
- event.ident == static_cast<uintptr_t>(child)) { |
- // Process died, it's safe to make a blocking call here since the |
- // kqueue() notification occurs when the process is already zombified. |
- BlockingReap(child); |
- return; |
- } else { |
- PLOG(ERROR) << "kevent() returned unexpected result - ke.fflags =" |
- << event.fflags |
- << " ke.ident =" |
- << event.ident |
- << " while listening for pid=" |
- << child; |
+ if (result == -1) { |
+ PLOG(ERROR) << "kevent (wait " << child << ")"; |
+ } else if (result > 1) { |
+ LOG(ERROR) << "kevent (wait " << child << "): unexpected result " |
+ << result; |
+ } else if (result == 1) { |
+ if ((event.fflags & NOTE_EXIT) && |
+ (event.ident == static_cast<uintptr_t>(child))) { |
+ // The process is dead or dying. This won't block for long, if at |
+ // all. |
+ BlockingReap(child); |
+ return; |
+ } else { |
+ LOG(ERROR) << "kevent (wait " << child |
+ << "): unexpected event: fflags=" << event.fflags |
+ << ", ident=" << event.ident; |
+ } |
+ } |
} |
} |
- // If we got here the child is still alive so kill it... |
- if (kill(child, SIGKILL) == 0) { |
- // SIGKILL is uncatchable. Since the signal was delivered, we can |
- // just wait for the process to die now in a blocking manner. |
+ // The child is still alive, or is very freshly dead. Be sure by sending it |
+ // a signal. This is safe even if it's freshly dead, because it will be a |
+ // zombie (or on the way to zombiedom) and kill will return 0 even if the |
+ // signal is not delivered to a live process. |
+ result = kill(child, SIGKILL); |
+ if (result == -1) { |
+ PLOG(ERROR) << "kill(" << child << ", SIGKILL)"; |
+ } else { |
+ // The child is definitely on the way out now. BlockingReap won't need to |
+ // wait for long, if at all. |
BlockingReap(child); |
- } else { |
- PLOG(ERROR) << "While waiting for " << child << " to terminate we" |
- << " failed to deliver a SIGKILL signal"; |
} |
} |
} // namespace |
void ProcessWatcher::EnsureProcessTerminated(base::ProcessHandle process) { |
- WaitForChildToDie(process, 2); |
+ WaitForChildToDie(process, kWaitBeforeKillSeconds); |
} |