Chromium Code Reviews| Index: go/src/infra/tools/device_watchdog/main.go |
| diff --git a/go/src/infra/tools/device_watchdog/main.go b/go/src/infra/tools/device_watchdog/main.go |
| index af4835ea441e855b26e77f9e9a85af23231e345d..2b31837332879bdcf2081a0cfa611c7b8443896e 100644 |
| --- a/go/src/infra/tools/device_watchdog/main.go |
| +++ b/go/src/infra/tools/device_watchdog/main.go |
| @@ -17,10 +17,12 @@ package main |
| import "C" |
| import ( |
| + "errors" |
| "flag" |
| "fmt" |
| "io/ioutil" |
| "os" |
| + "os/exec" |
| "strconv" |
| "strings" |
| "syscall" |
| @@ -31,7 +33,14 @@ import ( |
| ) |
| var ( |
| - logHeader = C.CString("CIT_DeviceWatchdog") |
| + logHeader = C.CString("CIT_DeviceWatchdog") |
| + errTimeout = errors.New("timeout") |
| +) |
| + |
| +const ( |
| + stdInFd = 0 |
| + stdOutFd = 1 |
| + stdErrFd = 2 |
| ) |
| type logLevel int |
| @@ -42,12 +51,6 @@ const ( |
| logError |
| ) |
| -const ( |
| - stdInFd = 0 |
| - stdOutFd = 1 |
| - stdErrFd = 2 |
| -) |
| - |
| func (l logLevel) getLogLevel() C.int { |
| switch l { |
| case logInfo: |
| @@ -67,9 +70,8 @@ func logcatLog(level logLevel, format string, args ...interface{}) { |
| C.__android_log_write(level.getLogLevel(), logHeader, cmsg) |
| } |
| -// Spawn a child process via fork, create new process group, chdir and |
| -// redirect std in and out to /dev/null. |
| -func daemonize() (int, error) { |
| +// Spawn a child process via exec. |
| +func daemonize(maxUptime int) (int, error) { |
|
dnj
2016/09/03 00:48:07
WDYT about having daemonize take an args slice ins
bpastene
2016/09/09 00:51:43
Done, but I'd rather replace argv[0] with /proc/se
|
| ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) |
|
dnj
2016/09/03 00:48:07
Forking here may be a little sketchy. In other dae
bpastene
2016/09/09 00:51:43
I was modeling it after https://chromium.googlesou
bpastene
2016/09/09 19:38:53
Nevermind. This is now all totally obsolete since
|
| pid := int(ret) |
| if errno != 0 { |
| @@ -78,12 +80,10 @@ func daemonize() (int, error) { |
| if pid > 0 { |
| return pid, nil |
| } |
| - |
| _, err := syscall.Setsid() |
| if err != nil { |
| return 0, err |
| } |
| - |
| f, err := os.Open("/dev/null") |
| if err != nil { |
| return 0, err |
| @@ -93,12 +93,27 @@ func daemonize() (int, error) { |
| syscall.Dup2(int(fd), stdOutFd) |
| syscall.Dup2(int(fd), stdErrFd) |
| - return pid, nil |
| + binary, err := os.Readlink("/proc/self/exe") |
| + if err != nil { |
| + return 0, err |
| + } |
| + cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize") |
| + err = cmd.Start() |
|
dnj
2016/09/03 00:48:07
You can avoid some junk by using "os.StartProcess"
bpastene
2016/09/09 00:51:43
I'm not sure what junk I'm avoiding, but done. (I
|
| + if err != nil { |
| + return 0, err |
| + } |
| + return cmd.Process.Pid, nil |
| +} |
| + |
| +type uptimeResult struct { |
| + Uptime time.Duration |
| + Err error |
| } |
| // Read from /proc/uptime. Expected format: |
| // "uptime_in_seconds cpu_idle_time_in_seconds" |
| -func getDeviceUptime() (time.Duration, error) { |
| +// Return the uptime via a channel for use with timeouts. |
| +func readUptime() (time.Duration, error) { |
| bytes, err := ioutil.ReadFile("/proc/uptime") |
| if err != nil { |
| return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) |
| @@ -115,6 +130,28 @@ func getDeviceUptime() (time.Duration, error) { |
| return time.Duration(uptime * float64(time.Second)), nil |
| } |
| +func getUptime(requestQueue chan<- chan uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) { |
|
dnj
2016/09/03 00:48:07
nit: chan<- chan<- uptimeResult
In other words, a
bpastene
2016/09/09 00:51:43
Done, but aren't I reading from those channels lat
dnj
2016/09/09 19:40:57
Yeah a bidirectional channel can be converted into
|
| + request := make(chan uptimeResult, 1) |
| + defer close(request) |
| + |
| + timer := time.NewTimer(timeoutPeriod) |
| + defer timer.Stop() |
| + |
| + select { |
| + case requestQueue <- request: |
| + break |
| + case <-timer.C: |
| + return 0, errTimeout |
| + } |
| + |
| + select { |
| + case resp := <-request: |
| + return resp.Uptime, resp.Err |
| + case <-timer.C: |
| + return 0, errTimeout |
| + } |
| +} |
| + |
| // Reboot device by writing to sysrq-trigger. See: |
| // https://www.kernel.org/doc/Documentation/sysrq.txt |
| func rebootDevice() error { |
| @@ -132,35 +169,61 @@ func rebootDevice() error { |
| func realMain() int { |
| maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") |
| + skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.") |
| flag.Parse() |
| - os.Chdir("/") |
| - pid, err := daemonize() |
| - if err != nil { |
| - logcatLog(logError, "Failed to daemonize: %s", err.Error()) |
| - return 1 |
| - } |
| - if pid > 0 { |
| + if !*skipDaemonizeFlag { |
| + pid, err := daemonize(*maxUptimeFlag) |
| + if err != nil { |
| + logcatLog(logError, "Failed to daemonize: %s", err.Error()) |
| + return 1 |
| + } |
| logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) |
| return 0 |
| } |
| + requestQueue := make(chan chan uptimeResult) |
| + go func() { |
| + for request := range requestQueue { |
| + uptime, err := readUptime() |
| + request <- uptimeResult{Uptime: uptime, Err: err} |
| + } |
| + }() |
| + defer close(requestQueue) |
| + |
| maxUptime := time.Duration(*maxUptimeFlag) * time.Minute |
| + consecutiveTimeouts := 0 |
| + maxTimeouts := 5 |
|
dnj
2016/09/03 00:48:07
nit: "const maxTimeouts = 5"
bpastene
2016/09/09 00:51:43
Done.
|
| for { |
| - uptime, err := getDeviceUptime() |
| - if err != nil { |
| + uptime, err := getUptime(requestQueue, 5*time.Second) |
| + switch err { |
| + case nil: |
| + consecutiveTimeouts = 0 |
| + case errTimeout: |
| + consecutiveTimeouts++ |
| + default: |
| logcatLog(logError, "Failed to get uptime: %s", err.Error()) |
| return 1 |
| } |
| + if consecutiveTimeouts == maxTimeouts { |
|
dnj
2016/09/03 00:48:07
nit: Just being paranoid, but might as well make t
bpastene
2016/09/09 00:51:43
Done.
|
| + logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts) |
|
dnj
2016/09/03 00:48:07
nit: currentTimeouts, not maxTimeouts.
bpastene
2016/09/09 00:51:43
Done.
|
| + break |
| + } else if consecutiveTimeouts > 0 { |
|
dnj
2016/09/03 00:48:07
nit: No need for "else if", since the previous sta
bpastene
2016/09/09 00:51:43
Done.
|
| + logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.") |
| + time.Sleep(60 * time.Second) |
| + continue |
| + } |
| if uptime > maxUptime { |
| logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) |
| break |
| } |
| logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) |
| + // Add an additional second to the sleep to ensure it doesn't |
| + // sleep several times in less than a second. |
| time.Sleep(maxUptime - uptime + time.Second) |
| } |
| - if err = rebootDevice(); err != nil { |
| + if err := rebootDevice(); err != nil { |
| logcatLog(logError, "Failed to reboot device: %s", err.Error()) |
| return 1 |
| } |