Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1145)

Unified Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2302193002: Change daemonize logic in watchdog and add timeout to file system read. (Closed)
Patch Set: add todo Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « go/deps.yaml ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: go/src/infra/tools/device_watchdog/main.go
diff --git a/go/src/infra/tools/device_watchdog/main.go b/go/src/infra/tools/device_watchdog/main.go
index af4835ea441e855b26e77f9e9a85af23231e345d..0990089aba92ba90a5c766fcd48fb53880b5fb9a 100644
--- a/go/src/infra/tools/device_watchdog/main.go
+++ b/go/src/infra/tools/device_watchdog/main.go
@@ -17,21 +17,29 @@ package main
import "C"
import (
+ "errors"
"flag"
"fmt"
"io/ioutil"
"os"
"strconv"
"strings"
- "syscall"
"time"
"unsafe"
+ "github.com/VividCortex/godaemon"
"github.com/luci/luci-go/common/runtime/paniccatcher"
)
var (
- logHeader = C.CString("CIT_DeviceWatchdog")
+ logHeader = C.CString("CIT_DeviceWatchdog")
+ errTimeout = errors.New("timeout")
+)
+
+const (
+ stdInFd = 0
+ stdOutFd = 1
+ stdErrFd = 2
)
type logLevel int
@@ -42,12 +50,6 @@ const (
logError
)
-const (
- stdInFd = 0
- stdOutFd = 1
- stdErrFd = 2
-)
-
func (l logLevel) getLogLevel() C.int {
switch l {
case logInfo:
@@ -67,38 +69,15 @@ func logcatLog(level logLevel, format string, args ...interface{}) {
C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
}
-// Spawn a child process via fork, create new process group, chdir and
-// redirect std in and out to /dev/null.
-func daemonize() (int, error) {
- ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
- pid := int(ret)
- if errno != 0 {
- return 0, errno
- }
- if pid > 0 {
- return pid, nil
- }
-
- _, err := syscall.Setsid()
- if err != nil {
- return 0, err
- }
-
- f, err := os.Open("/dev/null")
- if err != nil {
- return 0, err
- }
- fd := f.Fd()
- syscall.Dup2(int(fd), stdInFd)
- syscall.Dup2(int(fd), stdOutFd)
- syscall.Dup2(int(fd), stdErrFd)
-
- return pid, nil
+type uptimeResult struct {
+ Uptime time.Duration
+ Err error
}
// Read from /proc/uptime. Expected format:
// "uptime_in_seconds cpu_idle_time_in_seconds"
-func getDeviceUptime() (time.Duration, error) {
+// Blocks until the read finishes; use getUptime when a timeout is needed.
+func readUptime() (time.Duration, error) {
bytes, err := ioutil.ReadFile("/proc/uptime")
if err != nil {
return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error())
@@ -115,6 +94,28 @@ func getDeviceUptime() (time.Duration, error) {
return time.Duration(uptime * float64(time.Second)), nil
}
+// getUptime asks the reader goroutine behind requestQueue for the uptime and
+// returns errTimeout if no answer arrives within timeoutPeriod overall.
+func getUptime(requestQueue chan<- chan<- uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) {
+	// Buffered and never closed here: a reader that finishes after we time out
+	// must be able to send without blocking or panicking on a closed channel.
+	request := make(chan uptimeResult, 1)
+	timer := time.NewTimer(timeoutPeriod)
+	defer timer.Stop()
+
+	select {
+	case requestQueue <- request:
+	case <-timer.C:
+		return 0, errTimeout
+	}
+	select {
+	case resp := <-request:
+		return resp.Uptime, resp.Err
+	case <-timer.C:
+		return 0, errTimeout
+	}
+}
+
// Reboot device by writing to sysrq-trigger. See:
// https://www.kernel.org/doc/Documentation/sysrq.txt
func rebootDevice() error {
@@ -131,36 +132,54 @@ func rebootDevice() error {
}
func realMain() int {
+ godaemon.MakeDaemon(&godaemon.DaemonAttr{})
+
maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
flag.Parse()
- os.Chdir("/")
- pid, err := daemonize()
- if err != nil {
- logcatLog(logError, "Failed to daemonize: %s", err.Error())
- return 1
- }
- if pid > 0 {
- logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid)
- return 0
- }
+ requestQueue := make(chan chan<- uptimeResult)
+ go func() {
+ for request := range requestQueue {
+ uptime, err := readUptime()
+ request <- uptimeResult{Uptime: uptime, Err: err}
+ }
+ }()
+ defer close(requestQueue)
maxUptime := time.Duration(*maxUptimeFlag) * time.Minute
+ consecutiveTimeouts := 0
+ const maxTimeouts = 5
for {
- uptime, err := getDeviceUptime()
- if err != nil {
+ uptime, err := getUptime(requestQueue, 5*time.Second)
+ switch err {
+ case nil:
+ consecutiveTimeouts = 0
+ case errTimeout:
+ consecutiveTimeouts++
+ default:
logcatLog(logError, "Failed to get uptime: %s", err.Error())
return 1
}
+ if consecutiveTimeouts >= maxTimeouts {
+ logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", consecutiveTimeouts)
+ break
+ }
+ if consecutiveTimeouts > 0 {
+ logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.")
+ time.Sleep(60 * time.Second)
+ continue
+ }
if uptime > maxUptime {
logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime)
break
}
logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime)
+ // Add an additional second to the sleep to ensure it doesn't
+ // sleep several times in less than a second.
time.Sleep(maxUptime - uptime + time.Second)
}
- if err = rebootDevice(); err != nil {
+ if err := rebootDevice(); err != nil {
logcatLog(logError, "Failed to reboot device: %s", err.Error())
return 1
}
« no previous file with comments | « go/deps.yaml ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698