Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(302)

Side by Side Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2241963002: Implement device watchdog. (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: Comments Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // +build !windows 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
2 4
5 // +build android
6
7 // Watchdog daemon for android devices. It will attempt to reboot the device
8 // if its uptime exceeds a specified maximum.
3 package main 9 package main
4 10
11 /*
12 #cgo LDFLAGS: -landroid -llog
13
14 #include <android/log.h>
15 #include <string.h>
16 */
17 import "C"
18
5 import ( 19 import (
6 » "C" 20 » "flag"
7 "fmt" 21 "fmt"
22 "io/ioutil"
23 "math"
24 "os"
25 "strconv"
26 "strings"
27 "syscall"
28 "time"
29 "unsafe"
30
31 "github.com/luci/luci-go/common/runtime/paniccatcher"
8 ) 32 )
9 33
34 var (
35 logHeader = C.CString("CIT_DeviceWatchdog")
36 )
37
// logLevel is the severity of a message passed to logcatLog.
type logLevel int

// Severities understood by logLevel.getLogLevel. They are declared with the
// logLevel type (rather than as untyped integers) so that a value stored in
// an interface or inferred with := keeps its logLevel identity.
const (
	logInfo logLevel = iota
	logWarning
	logError
)
45
46 const (
47 stdInFd = 0
48 stdOutFd = 1
49 stdErrFd = 2
50 )
51
52 func (l logLevel) getLogLevel() C.int {
53 switch l {
54 case logInfo:
55 return C.ANDROID_LOG_INFO
56 case logWarning:
57 return C.ANDROID_LOG_WARN
58 case logError:
59 return C.ANDROID_LOG_ERROR
60 default:
61 panic("Unknown log level.")
62 }
63 }
64
65 func logcatLog(level logLevel, format string, args ...interface{}) {
66 cmsg := C.CString(fmt.Sprintf(format, args...))
67 defer C.free(unsafe.Pointer(cmsg))
68 C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
69 }
70
71 // Spawn a child process via fork, create new process group, chdir and
72 // redirect std in and out to /dev/null.
73 func daemonize() (int, error) {
74 os.Chdir("/")
dnj 2016/08/15 23:02:14 (I meant move this to realMain)
bpastene 2016/08/15 23:39:28 Done.
75
76 ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
77 pid := int(ret)
78 if errno != 0 {
79 return 0, errno
80 }
81 if pid > 0 {
82 return pid, nil
83 }
84
85 _, err := syscall.Setsid()
86 if err != nil {
87 return 0, err
88 }
89
90 f, err := os.Open("/dev/null")
91 if err != nil {
92 return 0, err
93 }
94 fd := f.Fd()
95 syscall.Dup2(int(fd), stdInFd)
96 syscall.Dup2(int(fd), stdOutFd)
97 syscall.Dup2(int(fd), stdErrFd)
98
99 return pid, nil
100 }
101
// getDeviceUptime reports the device uptime as read from /proc/uptime.
//
// The file is expected to look like
// "uptime_in_seconds cpu_idle_time_in_seconds"; only the first field is used
// and the idle time is dropped. An error is returned when the file cannot be
// read, contains no fields, or its first field is not a finite, non-negative
// number of seconds that fits in a time.Duration.
func getDeviceUptime() (time.Duration, error) {
	contents, err := ioutil.ReadFile("/proc/uptime")
	if err != nil {
		return 0, fmt.Errorf("unable to open /proc/uptime: %v", err)
	}

	fields := strings.Fields(string(contents))
	if len(fields) == 0 {
		return 0, fmt.Errorf("unable to parse /proc/uptime")
	}

	seconds, err := strconv.ParseFloat(fields[0], 64)
	if err != nil {
		return 0, fmt.Errorf("unable to parse uptime: %v", err)
	}

	// ParseFloat accepts "NaN" and "Inf" without error, and a negative or
	// huge value would not survive the conversion to nanoseconds below.
	maxSeconds := float64(math.MaxInt64) / float64(time.Second)
	if math.IsNaN(seconds) || seconds < 0 || seconds >= maxSeconds {
		return 0, fmt.Errorf("unable to parse uptime: value %q out of range", fields[0])
	}

	return time.Duration(seconds * float64(time.Second)), nil
}
120
121 // Reboot device by writing to sysrq-trigger. See:
122 // https://www.kernel.org/doc/Documentation/sysrq.txt
123 func rebootDevice() {
124 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0644)
dnj 2016/08/15 23:02:14 nit: get rid of 0644 since you're not creating a f
bpastene 2016/08/15 23:39:28 Needs it: https://golang.org/pkg/os/#OpenFile
dnj 2016/08/15 23:50:03 I was thinking pass 0 to be clear that you're not
bpastene 2016/08/16 19:47:26 Done.
125 if err != nil {
126 logcatLog(logError, "Can't open /proc/sysrq-trigger: %s", err.Er ror())
127 os.Exit(1)
128 }
129 defer fd.Close()
130 _, err = fd.Write([]byte("b"))
dnj 2016/08/15 23:02:14 nit, oneline: if _, err := fd.Write(...); err != n
bpastene 2016/08/15 23:39:28 That makes it more readable? I'm not sure I agree
dnj 2016/08/15 23:50:02 It's a coding preference we've exhibited in LUCI p
131 if err != nil {
132 logcatLog(logError, "Can't reboot: %s", err.Error())
133 os.Exit(1)
dnj 2016/08/15 23:02:14 WDYT about having this function actually return an
bpastene 2016/08/15 23:39:28 Done.
134 }
135 logcatLog(logError, "I just rebooted. How am I still alive?!?\n")
136 os.Exit(1)
137 }
138
139 func realMain() {
dnj 2016/08/15 23:02:14 Just a thought, but if you made "realMain" return
bpastene 2016/08/15 23:39:28 Good idea; done.
140
141 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
dnj 2016/08/15 23:02:14 If you wanted to, you could use a clockflag.Durati
bpastene 2016/08/15 23:39:28 I think that would add a lot more time.Duration-ty
142 flag.Parse()
143
144 pid, err := daemonize()
145 if err != nil {
146 logcatLog(logError, "Failed to daemonize: %s", err.Error())
147 os.Exit(1)
148 }
149 if pid > 0 {
150 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n" , pid)
151 os.Exit(0)
152 }
153
154 maxUptime := time.Duration(int64(*maxUptimeFlag) * int64(time.Minute))
dnj 2016/08/15 23:02:14 Make this: maxUptime := time.Duration(*maxUptimeFl
bpastene 2016/08/15 23:39:28 Done.
155 for {
156 uptime, err := getDeviceUptime()
157 if err != nil {
158 logcatLog(logError, "Failed to get uptime: %s", err.Erro r())
159 os.Exit(1)
160 }
161
162 if uptime > maxUptime {
163 logcatLog(logInfo, "Max uptime exceeded: (%.2f > %.2f)\n ", float64(uptime)/float64(time.Minute), float64(maxUptime)/float64(time.Minute) )
164 rebootDevice()
dnj 2016/08/15 23:02:14 Suggestion: break this into two loops: // Wait un
bpastene 2016/08/15 23:39:28 If that first reboot attempt doesn't work, why wou
dnj 2016/08/15 23:50:02 My thought was that this is a terminal point, so y
bpastene 2016/08/16 19:47:26 With 1), it doesn't silently disappear, its error
dnj 2016/08/16 19:51:47 Okay seems fair. Then I would recommend still brea
bpastene 2016/08/16 22:00:56 Done.
165 } else {
166 logcatLog(logInfo, "No need to reboot, uptime < max_upti me: (%.2f < %.2f)\n", float64(uptime)/float64(time.Minute), float64(maxUptime)/f loat64(time.Minute))
167 }
168 maxSleep := math.Max(float64(maxUptime-uptime), float64(time.Sec ond))
dnj 2016/08/15 23:02:14 Confused: you know "maxUptime" is >= "uptime", so
bpastene 2016/08/15 23:39:28 Thanks to obscure floating point precision issues,
dnj 2016/08/15 23:50:03 That's weird. Maybe just have it sleep fro (maxUpt
bpastene 2016/08/16 19:47:26 Ahh, the + 1 second is good idea. I'll do that.
169 time.Sleep(time.Duration(maxSleep))
170 }
171 }
172
10 func main() { 173 func main() {
11 » fmt.Println("Is this thing working?") 174 » paniccatcher.Do(realMain, func(p *paniccatcher.Panic) {
175 » » logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack)
176 » » os.Exit(1)
177 » })
12 } 178 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698