Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(236)

Side by Side Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2302193002: Change daemonize logic in watchdog and add timeout to file system read. (Closed)
Patch Set: Move getuptime to a single goroutine that responds to RPCs | Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // +build android 5 // +build android
6 6
7 // Watchdog daemon for android devices. It will attempt to reboot the device 7 // Watchdog daemon for android devices. It will attempt to reboot the device
8 // if its uptime exceeds a specified maximum. 8 // if its uptime exceeds a specified maximum.
9 package main 9 package main
10 10
11 /* 11 /*
12 #cgo LDFLAGS: -landroid -llog 12 #cgo LDFLAGS: -landroid -llog
13 13
14 #include <android/log.h> 14 #include <android/log.h>
15 #include <string.h> 15 #include <string.h>
16 */ 16 */
17 import "C" 17 import "C"
18 18
19 import ( 19 import (
20 "errors"
20 "flag" 21 "flag"
21 "fmt" 22 "fmt"
22 "io/ioutil" 23 "io/ioutil"
23 "os" 24 "os"
25 "os/exec"
24 "strconv" 26 "strconv"
25 "strings" 27 "strings"
26 "syscall" 28 "syscall"
27 "time" 29 "time"
28 "unsafe" 30 "unsafe"
29 31
30 "github.com/luci/luci-go/common/runtime/paniccatcher" 32 "github.com/luci/luci-go/common/runtime/paniccatcher"
31 ) 33 )
32 34
33 var ( 35 var (
34 » logHeader = C.CString("CIT_DeviceWatchdog") 36 » logHeader = C.CString("CIT_DeviceWatchdog")
37 » errTimeout = errors.New("timeout")
38 )
39
40 const (
41 » stdInFd = 0
42 » stdOutFd = 1
43 » stdErrFd = 2
35 ) 44 )
36 45
37 type logLevel int 46 type logLevel int
38 47
39 const ( 48 const (
40 logInfo = iota 49 logInfo = iota
41 logWarning 50 logWarning
42 logError 51 logError
43 ) 52 )
44 53
45 const (
46 stdInFd = 0
47 stdOutFd = 1
48 stdErrFd = 2
49 )
50
51 func (l logLevel) getLogLevel() C.int { 54 func (l logLevel) getLogLevel() C.int {
52 switch l { 55 switch l {
53 case logInfo: 56 case logInfo:
54 return C.ANDROID_LOG_INFO 57 return C.ANDROID_LOG_INFO
55 case logWarning: 58 case logWarning:
56 return C.ANDROID_LOG_WARN 59 return C.ANDROID_LOG_WARN
57 case logError: 60 case logError:
58 return C.ANDROID_LOG_ERROR 61 return C.ANDROID_LOG_ERROR
59 default: 62 default:
60 panic("Unknown log level.") 63 panic("Unknown log level.")
61 } 64 }
62 } 65 }
63 66
64 func logcatLog(level logLevel, format string, args ...interface{}) { 67 func logcatLog(level logLevel, format string, args ...interface{}) {
65 cmsg := C.CString(fmt.Sprintf(format, args...)) 68 cmsg := C.CString(fmt.Sprintf(format, args...))
66 defer C.free(unsafe.Pointer(cmsg)) 69 defer C.free(unsafe.Pointer(cmsg))
67 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) 70 C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
68 } 71 }
69 72
70 // Spawn a child process via fork, create new process group, chdir and 73 // Spawn a child process via exec.
71 // redirect std in and out to /dev/null. 74 func daemonize(maxUptime int) (int, error) {
dnj 2016/09/03 00:48:07 WDYT about having daemonize take an args slice ins
bpastene 2016/09/09 00:51:43 Done, but I'd rather replace argv[0] with /proc/se
72 func daemonize() (int, error) {
73 ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) 75 ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
dnj 2016/09/03 00:48:07 Forking here may be a little sketchy. In other dae
bpastene 2016/09/09 00:51:43 I was modeling it after https://chromium.googlesou
bpastene 2016/09/09 19:38:53 Nevermind. This is now all totally obsolete since
74 pid := int(ret) 76 pid := int(ret)
75 if errno != 0 { 77 if errno != 0 {
76 return 0, errno 78 return 0, errno
77 } 79 }
78 if pid > 0 { 80 if pid > 0 {
79 return pid, nil 81 return pid, nil
80 } 82 }
81
82 _, err := syscall.Setsid() 83 _, err := syscall.Setsid()
83 if err != nil { 84 if err != nil {
84 return 0, err 85 return 0, err
85 } 86 }
86
87 f, err := os.Open("/dev/null") 87 f, err := os.Open("/dev/null")
88 if err != nil { 88 if err != nil {
89 return 0, err 89 return 0, err
90 } 90 }
91 fd := f.Fd() 91 fd := f.Fd()
92 syscall.Dup2(int(fd), stdInFd) 92 syscall.Dup2(int(fd), stdInFd)
93 syscall.Dup2(int(fd), stdOutFd) 93 syscall.Dup2(int(fd), stdOutFd)
94 syscall.Dup2(int(fd), stdErrFd) 94 syscall.Dup2(int(fd), stdErrFd)
95 95
96 » return pid, nil 96 » binary, err := os.Readlink("/proc/self/exe")
97 » if err != nil {
98 » » return 0, err
99 » }
100 » cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize")
101 » err = cmd.Start()
dnj 2016/09/03 00:48:07 You can avoid some junk by using "os.StartProcess"
bpastene 2016/09/09 00:51:43 I'm not sure what junk I'm avoiding, but done. (I
102 » if err != nil {
103 » » return 0, err
104 » }
105 » return cmd.Process.Pid, nil
106 }
107
108 type uptimeResult struct {
109 » Uptime time.Duration
110 » Err error
97 } 111 }
98 112
99 // Read from /proc/uptime. Expected format: 113 // Read from /proc/uptime. Expected format:
100 // "uptime_in_seconds cpu_idle_time_in_seconds" 114 // "uptime_in_seconds cpu_idle_time_in_seconds"
101 func getDeviceUptime() (time.Duration, error) { 115 // Return the uptime via a channel for use with timeouts.
116 func readUptime() (time.Duration, error) {
102 bytes, err := ioutil.ReadFile("/proc/uptime") 117 bytes, err := ioutil.ReadFile("/proc/uptime")
103 if err != nil { 118 if err != nil {
104 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) 119 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error())
105 } 120 }
106 // Split on the space to get uptime and drop cpu idle time. 121 // Split on the space to get uptime and drop cpu idle time.
107 uptimeFields := strings.Fields(string(bytes)) 122 uptimeFields := strings.Fields(string(bytes))
108 if len(uptimeFields) == 0 { 123 if len(uptimeFields) == 0 {
109 return 0, fmt.Errorf("unable to parse /proc/uptime") 124 return 0, fmt.Errorf("unable to parse /proc/uptime")
110 } 125 }
111 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) 126 uptime, err := strconv.ParseFloat(uptimeFields[0], 64)
112 if err != nil { 127 if err != nil {
113 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) 128 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error())
114 } 129 }
115 return time.Duration(uptime * float64(time.Second)), nil 130 return time.Duration(uptime * float64(time.Second)), nil
116 } 131 }
117 132
133 func getUptime(requestQueue chan<- chan uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) {
dnj 2016/09/03 00:48:07 nit: chan<- chan<- uptimeResult In other words, a
bpastene 2016/09/09 00:51:43 Done, but aren't I reading from those channels lat
dnj 2016/09/09 19:40:57 Yeah a bidirectional channel can be converted into
134 request := make(chan uptimeResult, 1)
135 defer close(request)
136
137 timer := time.NewTimer(timeoutPeriod)
138 defer timer.Stop()
139
140 select {
141 case requestQueue <- request:
142 break
143 case <-timer.C:
144 return 0, errTimeout
145 }
146
147 select {
148 case resp := <-request:
149 return resp.Uptime, resp.Err
150 case <-timer.C:
151 return 0, errTimeout
152 }
153 }
154
118 // Reboot device by writing to sysrq-trigger. See: 155 // Reboot device by writing to sysrq-trigger. See:
119 // https://www.kernel.org/doc/Documentation/sysrq.txt 156 // https://www.kernel.org/doc/Documentation/sysrq.txt
120 func rebootDevice() error { 157 func rebootDevice() error {
121 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) 158 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0)
122 if err != nil { 159 if err != nil {
123 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) 160 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error())
124 } 161 }
125 defer fd.Close() 162 defer fd.Close()
126 _, err = fd.Write([]byte("b")) 163 _, err = fd.Write([]byte("b"))
127 if err != nil { 164 if err != nil {
128 return fmt.Errorf("Can't reboot: %s", err.Error()) 165 return fmt.Errorf("Can't reboot: %s", err.Error())
129 } 166 }
130 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") 167 return fmt.Errorf("I just rebooted. How am I still alive?!?\n")
131 } 168 }
132 169
133 func realMain() int { 170 func realMain() int {
134 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") 171 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
172 skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.")
135 flag.Parse() 173 flag.Parse()
136 174
137 » os.Chdir("/") 175 » if !*skipDaemonizeFlag {
138 » pid, err := daemonize() 176 » » pid, err := daemonize(*maxUptimeFlag)
139 » if err != nil { 177 » » if err != nil {
140 » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) 178 » » » logcatLog(logError, "Failed to daemonize: %s", err.Error())
141 » » return 1 179 » » » return 1
142 » } 180 » » }
143 » if pid > 0 {
144 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) 181 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid)
145 return 0 182 return 0
146 } 183 }
147 184
185 requestQueue := make(chan chan uptimeResult)
186 go func() {
187 for request := range requestQueue {
188 uptime, err := readUptime()
189 request <- uptimeResult{Uptime: uptime, Err: err}
190 }
191 }()
192 defer close(requestQueue)
193
148 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute 194 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute
195 consecutiveTimeouts := 0
196 maxTimeouts := 5
dnj 2016/09/03 00:48:07 nit: "const maxTimeouts = 5"
bpastene 2016/09/09 00:51:43 Done.
149 for { 197 for {
150 » » uptime, err := getDeviceUptime() 198 » » uptime, err := getUptime(requestQueue, 5*time.Second)
151 » » if err != nil { 199 » » switch err {
200 » » case nil:
201 » » » consecutiveTimeouts = 0
202 » » case errTimeout:
203 » » » consecutiveTimeouts++
204 » » default:
152 logcatLog(logError, "Failed to get uptime: %s", err.Error()) 205 logcatLog(logError, "Failed to get uptime: %s", err.Error())
153 return 1 206 return 1
154 } 207 }
208 if consecutiveTimeouts == maxTimeouts {
dnj 2016/09/03 00:48:07 nit: Just being paranoid, but might as well make t
bpastene 2016/09/09 00:51:43 Done.
209 logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts)
dnj 2016/09/03 00:48:07 nit: currentTimeouts, not maxTimeouts.
bpastene 2016/09/09 00:51:43 Done.
210 break
211 } else if consecutiveTimeouts > 0 {
dnj 2016/09/03 00:48:07 nit: No need for "else if", since the previous sta
bpastene 2016/09/09 00:51:43 Done.
212 logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.")
213 time.Sleep(60 * time.Second)
214 continue
215 }
155 216
156 if uptime > maxUptime { 217 if uptime > maxUptime {
157 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) 218 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime)
158 break 219 break
159 } 220 }
160 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) 221 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime)
222 // Add an additional second to the sleep to ensure it doesn't
223 // sleep several times in less than a second.
161 time.Sleep(maxUptime - uptime + time.Second) 224 time.Sleep(maxUptime - uptime + time.Second)
162 } 225 }
163 » if err = rebootDevice(); err != nil { 226 » if err := rebootDevice(); err != nil {
164 logcatLog(logError, "Failed to reboot device: %s", err.Error()) 227 logcatLog(logError, "Failed to reboot device: %s", err.Error())
165 return 1 228 return 1
166 } 229 }
167 return 0 230 return 0
168 } 231 }
169 232
170 func main() { 233 func main() {
171 paniccatcher.Do(func() { 234 paniccatcher.Do(func() {
172 os.Exit(realMain()) 235 os.Exit(realMain())
173 }, func(p *paniccatcher.Panic) { 236 }, func(p *paniccatcher.Panic) {
174 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) 237 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack)
175 os.Exit(1) 238 os.Exit(1)
176 }) 239 })
177 } 240 }
OLD | NEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698