Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(347)

Unified Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2302193002: Change daemonize logic in watchdog and add timeout to file system read. (Closed)
Patch Set: Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: go/src/infra/tools/device_watchdog/main.go
diff --git a/go/src/infra/tools/device_watchdog/main.go b/go/src/infra/tools/device_watchdog/main.go
index af4835ea441e855b26e77f9e9a85af23231e345d..7f01052e3999fae2dc3860529ba7d5ba66053c30 100644
--- a/go/src/infra/tools/device_watchdog/main.go
+++ b/go/src/infra/tools/device_watchdog/main.go
@@ -21,9 +21,9 @@ import (
"fmt"
"io/ioutil"
"os"
+ "os/exec"
"strconv"
"strings"
- "syscall"
"time"
"unsafe"
@@ -42,12 +42,6 @@ const (
logError
)
-const (
- stdInFd = 0
- stdOutFd = 1
- stdErrFd = 2
-)
-
func (l logLevel) getLogLevel() C.int {
switch l {
case logInfo:
@@ -67,52 +61,46 @@ func logcatLog(level logLevel, format string, args ...interface{}) {
C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
}
-// Spawn a child process via fork, create new process group, chdir and
-// redirect std in and out to /dev/null.
-func daemonize() (int, error) {
- ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
- pid := int(ret)
- if errno != 0 {
- return 0, errno
- }
- if pid > 0 {
- return pid, nil
- }
-
- _, err := syscall.Setsid()
+// Spawn a child process via exec.
+func daemonize(maxUptime int) (int, error) {
dnj 2016/09/02 16:17:32 This isn't a full daemonize workflow. It doesn't s…
bpastene 2016/09/02 23:02:13 There were problems with threads not correctly get…
+ binary, err := os.Readlink("/proc/self/exe")
if err != nil {
return 0, err
}
-
- f, err := os.Open("/dev/null")
+ cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize")
+ err = cmd.Start()
if err != nil {
return 0, err
}
- fd := f.Fd()
- syscall.Dup2(int(fd), stdInFd)
- syscall.Dup2(int(fd), stdOutFd)
- syscall.Dup2(int(fd), stdErrFd)
+ return cmd.Process.Pid, nil
+}
- return pid, nil
+type uptimeResult struct {
+ Uptime time.Duration
+ Err error
}
// Read from /proc/uptime. Expected format:
// "uptime_in_seconds cpu_idle_time_in_seconds"
-func getDeviceUptime() (time.Duration, error) {
+// Return the uptime via a channel for use with timeouts.
+func getDeviceUptime(c chan uptimeResult) {
dnj 2016/09/02 16:17:32 Make this a directional channel: c chan<- uptimeRe…
bpastene 2016/09/02 23:02:13 Done.
bytes, err := ioutil.ReadFile("/proc/uptime")
if err != nil {
- return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error())
+ c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to open /proc/uptime: %s", err.Error())}
+ return
}
// Split on the space to get uptime and drop cpu idle time.
uptimeFields := strings.Fields(string(bytes))
if len(uptimeFields) == 0 {
- return 0, fmt.Errorf("unable to parse /proc/uptime")
+ c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse /proc/uptime")}
+ return
}
uptime, err := strconv.ParseFloat(uptimeFields[0], 64)
if err != nil {
- return 0, fmt.Errorf("unable to parse uptime: %s", err.Error())
+ c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse uptime: %s", err.Error())}
+ return
}
- return time.Duration(uptime * float64(time.Second)), nil
+ c <- uptimeResult{Uptime: time.Duration(uptime * float64(time.Second)), Err: nil}
}
// Reboot device by writing to sysrq-trigger. See:
@@ -132,35 +120,53 @@ func rebootDevice() error {
func realMain() int {
maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
+ skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.")
flag.Parse()
- os.Chdir("/")
- pid, err := daemonize()
- if err != nil {
- logcatLog(logError, "Failed to daemonize: %s", err.Error())
- return 1
- }
- if pid > 0 {
+ if !*skipDaemonizeFlag {
+ pid, err := daemonize(*maxUptimeFlag)
+ if err != nil {
+ logcatLog(logError, "Failed to daemonize: %s", err.Error())
+ return 1
+ }
logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid)
return 0
}
+ var uptimeRes uptimeResult
maxUptime := time.Duration(*maxUptimeFlag) * time.Minute
+ consecutiveTimeouts := 0
+ maxTimeouts := 5
for {
- uptime, err := getDeviceUptime()
- if err != nil {
- logcatLog(logError, "Failed to get uptime: %s", err.Error())
+ c := make(chan uptimeResult, 1)
+ go getDeviceUptime(c)
bpastene 2016/09/02 00:05:22 Uses a goroutine + select statement to add a time…
dnj 2016/09/02 16:17:32 Select is correct, but you launch a new goroutine…
bpastene 2016/09/02 23:02:13 Thanks for the suggestion Dan. I added the single…
+ select {
+ case uptimeRes = <-c:
+ consecutiveTimeouts = 0
+ case <-time.After(5 * time.Second):
+ consecutiveTimeouts++
+ }
+ if consecutiveTimeouts == maxTimeouts {
+ logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts)
+ break
+ } else if consecutiveTimeouts > 0 {
+ logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.")
+ time.Sleep(60 * time.Second)
+ continue
+ }
+ if uptimeRes.Err != nil {
+ logcatLog(logError, "Failed to get uptime: %s", uptimeRes.Err.Error())
return 1
}
- if uptime > maxUptime {
- logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime)
+ if uptimeRes.Uptime > maxUptime {
+ logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptimeRes.Uptime, maxUptime)
break
}
- logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime)
- time.Sleep(maxUptime - uptime + time.Second)
+ logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptimeRes.Uptime, maxUptime)
+ time.Sleep(maxUptime - uptimeRes.Uptime + time.Second)
dnj 2016/09/02 16:17:32 (Before the world forgets, add a comment about why…
bpastene 2016/09/02 23:02:13 Done.
}
- if err = rebootDevice(); err != nil {
+ if err := rebootDevice(); err != nil {
logcatLog(logError, "Failed to reboot device: %s", err.Error())
return 1
}
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698