Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(35)

Unified Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2302193002: Change daemonize logic in watchdog and add timeout to file system read. (Closed)
Patch Set: Move getuptime to a single goroutine that responds to RPCs Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: go/src/infra/tools/device_watchdog/main.go
diff --git a/go/src/infra/tools/device_watchdog/main.go b/go/src/infra/tools/device_watchdog/main.go
index af4835ea441e855b26e77f9e9a85af23231e345d..2b31837332879bdcf2081a0cfa611c7b8443896e 100644
--- a/go/src/infra/tools/device_watchdog/main.go
+++ b/go/src/infra/tools/device_watchdog/main.go
@@ -17,10 +17,12 @@ package main
import "C"
import (
+ "errors"
"flag"
"fmt"
"io/ioutil"
"os"
+ "os/exec"
"strconv"
"strings"
"syscall"
@@ -31,7 +33,14 @@ import (
)
var (
- logHeader = C.CString("CIT_DeviceWatchdog")
+ logHeader = C.CString("CIT_DeviceWatchdog")
+ errTimeout = errors.New("timeout")
+)
+
+const (
+ stdInFd = 0
+ stdOutFd = 1
+ stdErrFd = 2
)
type logLevel int
@@ -42,12 +51,6 @@ const (
logError
)
-const (
- stdInFd = 0
- stdOutFd = 1
- stdErrFd = 2
-)
-
func (l logLevel) getLogLevel() C.int {
switch l {
case logInfo:
@@ -67,9 +70,8 @@ func logcatLog(level logLevel, format string, args ...interface{}) {
C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
}
-// Spawn a child process via fork, create new process group, chdir and
-// redirect std in and out to /dev/null.
-func daemonize() (int, error) {
+// Spawn a child process via exec.
+func daemonize(maxUptime int) (int, error) {
dnj 2016/09/03 00:48:07 WDYT about having daemonize take an args slice ins
bpastene 2016/09/09 00:51:43 Done, but I'd rather replace argv[0] with /proc/se
ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
dnj 2016/09/03 00:48:07 Forking here may be a little sketchy. In other dae
bpastene 2016/09/09 00:51:43 I was modeling it after https://chromium.googlesou
bpastene 2016/09/09 19:38:53 Nevermind. This is now all totally obsolete since
pid := int(ret)
if errno != 0 {
@@ -78,12 +80,10 @@ func daemonize() (int, error) {
if pid > 0 {
return pid, nil
}
-
_, err := syscall.Setsid()
if err != nil {
return 0, err
}
-
f, err := os.Open("/dev/null")
if err != nil {
return 0, err
@@ -93,12 +93,27 @@ func daemonize() (int, error) {
syscall.Dup2(int(fd), stdOutFd)
syscall.Dup2(int(fd), stdErrFd)
- return pid, nil
+ binary, err := os.Readlink("/proc/self/exe")
+ if err != nil {
+ return 0, err
+ }
+ cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize")
+ err = cmd.Start()
dnj 2016/09/03 00:48:07 You can avoid some junk by using "os.StartProcess"
bpastene 2016/09/09 00:51:43 I'm not sure what junk I'm avoiding, but done. (I
+ if err != nil {
+ return 0, err
+ }
+ return cmd.Process.Pid, nil
+}
+
+type uptimeResult struct {
+ Uptime time.Duration
+ Err error
}
// Read from /proc/uptime. Expected format:
// "uptime_in_seconds cpu_idle_time_in_seconds"
-func getDeviceUptime() (time.Duration, error) {
+// Return the uptime via a channel for use with timeouts.
+func readUptime() (time.Duration, error) {
bytes, err := ioutil.ReadFile("/proc/uptime")
if err != nil {
return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error())
@@ -115,6 +130,28 @@ func getDeviceUptime() (time.Duration, error) {
return time.Duration(uptime * float64(time.Second)), nil
}
+func getUptime(requestQueue chan<- chan uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) {
dnj 2016/09/03 00:48:07 nit: chan<- chan<- uptimeResult In other words, a
bpastene 2016/09/09 00:51:43 Done, but aren't I reading from those channels lat
dnj 2016/09/09 19:40:57 Yeah a bidirectional channel can be converted into
+ request := make(chan uptimeResult, 1)
+ defer close(request)
+
+ timer := time.NewTimer(timeoutPeriod)
+ defer timer.Stop()
+
+ select {
+ case requestQueue <- request:
+ break
+ case <-timer.C:
+ return 0, errTimeout
+ }
+
+ select {
+ case resp := <-request:
+ return resp.Uptime, resp.Err
+ case <-timer.C:
+ return 0, errTimeout
+ }
+}
+
// Reboot device by writing to sysrq-trigger. See:
// https://www.kernel.org/doc/Documentation/sysrq.txt
func rebootDevice() error {
@@ -132,35 +169,61 @@ func rebootDevice() error {
func realMain() int {
maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
+ skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.")
flag.Parse()
- os.Chdir("/")
- pid, err := daemonize()
- if err != nil {
- logcatLog(logError, "Failed to daemonize: %s", err.Error())
- return 1
- }
- if pid > 0 {
+ if !*skipDaemonizeFlag {
+ pid, err := daemonize(*maxUptimeFlag)
+ if err != nil {
+ logcatLog(logError, "Failed to daemonize: %s", err.Error())
+ return 1
+ }
logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid)
return 0
}
+ requestQueue := make(chan chan uptimeResult)
+ go func() {
+ for request := range requestQueue {
+ uptime, err := readUptime()
+ request <- uptimeResult{Uptime: uptime, Err: err}
+ }
+ }()
+ defer close(requestQueue)
+
maxUptime := time.Duration(*maxUptimeFlag) * time.Minute
+ consecutiveTimeouts := 0
+ maxTimeouts := 5
dnj 2016/09/03 00:48:07 nit: "const maxTimeouts = 5"
bpastene 2016/09/09 00:51:43 Done.
for {
- uptime, err := getDeviceUptime()
- if err != nil {
+ uptime, err := getUptime(requestQueue, 5*time.Second)
+ switch err {
+ case nil:
+ consecutiveTimeouts = 0
+ case errTimeout:
+ consecutiveTimeouts++
+ default:
logcatLog(logError, "Failed to get uptime: %s", err.Error())
return 1
}
+ if consecutiveTimeouts == maxTimeouts {
dnj 2016/09/03 00:48:07 nit: Just being paranoid, but might as well make t
bpastene 2016/09/09 00:51:43 Done.
+ logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts)
dnj 2016/09/03 00:48:07 nit: currentTimeouts, not maxTimeouts.
bpastene 2016/09/09 00:51:43 Done.
+ break
+ } else if consecutiveTimeouts > 0 {
dnj 2016/09/03 00:48:07 nit: No need for "else if", since the previous sta
bpastene 2016/09/09 00:51:43 Done.
+ logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.")
+ time.Sleep(60 * time.Second)
+ continue
+ }
if uptime > maxUptime {
logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime)
break
}
logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime)
+ // Add an additional second to the sleep to ensure it doesn't
+ // sleep several times in less than a second.
time.Sleep(maxUptime - uptime + time.Second)
}
- if err = rebootDevice(); err != nil {
+ if err := rebootDevice(); err != nil {
logcatLog(logError, "Failed to reboot device: %s", err.Error())
return 1
}
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698