Chromium Code Reviews| Index: go/src/infra/tools/device_watchdog/main.go |
| diff --git a/go/src/infra/tools/device_watchdog/main.go b/go/src/infra/tools/device_watchdog/main.go |
| index af4835ea441e855b26e77f9e9a85af23231e345d..7f01052e3999fae2dc3860529ba7d5ba66053c30 100644 |
| --- a/go/src/infra/tools/device_watchdog/main.go |
| +++ b/go/src/infra/tools/device_watchdog/main.go |
| @@ -21,9 +21,9 @@ import ( |
| "fmt" |
| "io/ioutil" |
| "os" |
| + "os/exec" |
| "strconv" |
| "strings" |
| - "syscall" |
| "time" |
| "unsafe" |
| @@ -42,12 +42,6 @@ const ( |
| logError |
| ) |
| -const ( |
| - stdInFd = 0 |
| - stdOutFd = 1 |
| - stdErrFd = 2 |
| -) |
| - |
| func (l logLevel) getLogLevel() C.int { |
| switch l { |
| case logInfo: |
| @@ -67,52 +61,46 @@ func logcatLog(level logLevel, format string, args ...interface{}) { |
| C.__android_log_write(level.getLogLevel(), logHeader, cmsg) |
| } |
| -// Spawn a child process via fork, create new process group, chdir and |
| -// redirect std in and out to /dev/null. |
| -func daemonize() (int, error) { |
| - ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) |
| - pid := int(ret) |
| - if errno != 0 { |
| - return 0, errno |
| - } |
| - if pid > 0 { |
| - return pid, nil |
| - } |
| - |
| - _, err := syscall.Setsid() |
| +// Spawn a child process via exec. |
| +func daemonize(maxUptime int) (int, error) { |
|
dnj
2016/09/02 16:17:32
This isn't a full daemonize workflow. It doesn't s
bpastene
2016/09/02 23:02:13
There were problems with threads not correctly get
|
| + binary, err := os.Readlink("/proc/self/exe") |
| if err != nil { |
| return 0, err |
| } |
| - |
| - f, err := os.Open("/dev/null") |
| + cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize") |
| + err = cmd.Start() |
| if err != nil { |
| return 0, err |
| } |
| - fd := f.Fd() |
| - syscall.Dup2(int(fd), stdInFd) |
| - syscall.Dup2(int(fd), stdOutFd) |
| - syscall.Dup2(int(fd), stdErrFd) |
| + return cmd.Process.Pid, nil |
| +} |
| - return pid, nil |
| +type uptimeResult struct { |
| + Uptime time.Duration |
| + Err error |
| } |
| // Read from /proc/uptime. Expected format: |
| // "uptime_in_seconds cpu_idle_time_in_seconds" |
| -func getDeviceUptime() (time.Duration, error) { |
| +// Return the uptime via a channel for use with timeouts. |
| +func getDeviceUptime(c chan uptimeResult) { |
|
dnj
2016/09/02 16:17:32
Make this a directional channel:
c chan<- uptimeRe
bpastene
2016/09/02 23:02:13
Done.
|
| bytes, err := ioutil.ReadFile("/proc/uptime") |
| if err != nil { |
| - return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) |
| + c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to open /proc/uptime: %s", err.Error())} |
| + return |
| } |
| // Split on the space to get uptime and drop cpu idle time. |
| uptimeFields := strings.Fields(string(bytes)) |
| if len(uptimeFields) == 0 { |
| - return 0, fmt.Errorf("unable to parse /proc/uptime") |
| + c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse /proc/uptime")} |
| + return |
| } |
| uptime, err := strconv.ParseFloat(uptimeFields[0], 64) |
| if err != nil { |
| - return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) |
| + c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse uptime: %s", err.Error())} |
| + return |
| } |
| - return time.Duration(uptime * float64(time.Second)), nil |
| + c <- uptimeResult{Uptime: time.Duration(uptime * float64(time.Second)), Err: nil} |
| } |
| // Reboot device by writing to sysrq-trigger. See: |
| @@ -132,35 +120,53 @@ func rebootDevice() error { |
| func realMain() int { |
| maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") |
| + skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.") |
| flag.Parse() |
| - os.Chdir("/") |
| - pid, err := daemonize() |
| - if err != nil { |
| - logcatLog(logError, "Failed to daemonize: %s", err.Error()) |
| - return 1 |
| - } |
| - if pid > 0 { |
| + if !*skipDaemonizeFlag { |
| + pid, err := daemonize(*maxUptimeFlag) |
| + if err != nil { |
| + logcatLog(logError, "Failed to daemonize: %s", err.Error()) |
| + return 1 |
| + } |
| logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) |
| return 0 |
| } |
| + var uptimeRes uptimeResult |
| maxUptime := time.Duration(*maxUptimeFlag) * time.Minute |
| + consecutiveTimeouts := 0 |
| + maxTimeouts := 5 |
| for { |
| - uptime, err := getDeviceUptime() |
| - if err != nil { |
| - logcatLog(logError, "Failed to get uptime: %s", err.Error()) |
| + c := make(chan uptimeResult, 1) |
| + go getDeviceUptime(c) |
|
bpastene
2016/09/02 00:05:22
Uses a goroutine + select statement to add a time
dnj
2016/09/02 16:17:32
Select is correct, but you launch a new goroutine
bpastene
2016/09/02 23:02:13
Thanks for the suggestion Dan. I added the single
|
| + select { |
| + case uptimeRes = <-c: |
| + consecutiveTimeouts = 0 |
| + case <-time.After(5 * time.Second): |
| + consecutiveTimeouts++ |
| + } |
| + if consecutiveTimeouts == maxTimeouts { |
| + logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts) |
| + break |
| + } else if consecutiveTimeouts > 0 { |
| + logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.") |
| + time.Sleep(60 * time.Second) |
| + continue |
| + } |
| + if uptimeRes.Err != nil { |
| + logcatLog(logError, "Failed to get uptime: %s", uptimeRes.Err.Error()) |
| return 1 |
| } |
| - if uptime > maxUptime { |
| - logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) |
| + if uptimeRes.Uptime > maxUptime { |
| + logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptimeRes.Uptime, maxUptime) |
| break |
| } |
| - logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) |
| - time.Sleep(maxUptime - uptime + time.Second) |
| + logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptimeRes.Uptime, maxUptime) |
| + time.Sleep(maxUptime - uptimeRes.Uptime + time.Second) |
|
dnj
2016/09/02 16:17:32
(Before the world forgets, add a comment about why
bpastene
2016/09/02 23:02:13
Done.
|
| } |
| - if err = rebootDevice(); err != nil { |
| + if err := rebootDevice(); err != nil { |
| logcatLog(logError, "Failed to reboot device: %s", err.Error()) |
| return 1 |
| } |