Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(134)

Side by Side Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2302193002: Change daemonize logic in watchdog and add timeout to file system read. (Closed)
Patch Set: add todo Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « go/deps.yaml ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // +build android 5 // +build android
6 6
7 // Watchdog daemon for android devices. It will attempt to reboot the device 7 // Watchdog daemon for android devices. It will attempt to reboot the device
8 // if its uptime exceeds a specified maximum. 8 // if its uptime exceeds a specified maximum.
9 package main 9 package main
10 10
11 /* 11 /*
12 #cgo LDFLAGS: -landroid -llog 12 #cgo LDFLAGS: -landroid -llog
13 13
14 #include <android/log.h> 14 #include <android/log.h>
15 #include <string.h> 15 #include <string.h>
16 */ 16 */
17 import "C" 17 import "C"
18 18
19 import ( 19 import (
20 "errors"
20 "flag" 21 "flag"
21 "fmt" 22 "fmt"
22 "io/ioutil" 23 "io/ioutil"
23 "os" 24 "os"
24 "strconv" 25 "strconv"
25 "strings" 26 "strings"
26 "syscall"
27 "time" 27 "time"
28 "unsafe" 28 "unsafe"
29 29
30 "github.com/VividCortex/godaemon"
30 "github.com/luci/luci-go/common/runtime/paniccatcher" 31 "github.com/luci/luci-go/common/runtime/paniccatcher"
31 ) 32 )
32 33
33 var ( 34 var (
34 » logHeader = C.CString("CIT_DeviceWatchdog") 35 » logHeader = C.CString("CIT_DeviceWatchdog")
36 » errTimeout = errors.New("timeout")
37 )
38
39 const (
40 » stdInFd = 0
41 » stdOutFd = 1
42 » stdErrFd = 2
35 ) 43 )
36 44
37 type logLevel int 45 type logLevel int
38 46
39 const ( 47 const (
40 logInfo = iota 48 logInfo = iota
41 logWarning 49 logWarning
42 logError 50 logError
43 ) 51 )
44 52
45 const (
46 stdInFd = 0
47 stdOutFd = 1
48 stdErrFd = 2
49 )
50
51 func (l logLevel) getLogLevel() C.int { 53 func (l logLevel) getLogLevel() C.int {
52 switch l { 54 switch l {
53 case logInfo: 55 case logInfo:
54 return C.ANDROID_LOG_INFO 56 return C.ANDROID_LOG_INFO
55 case logWarning: 57 case logWarning:
56 return C.ANDROID_LOG_WARN 58 return C.ANDROID_LOG_WARN
57 case logError: 59 case logError:
58 return C.ANDROID_LOG_ERROR 60 return C.ANDROID_LOG_ERROR
59 default: 61 default:
60 panic("Unknown log level.") 62 panic("Unknown log level.")
61 } 63 }
62 } 64 }
63 65
64 func logcatLog(level logLevel, format string, args ...interface{}) { 66 func logcatLog(level logLevel, format string, args ...interface{}) {
65 cmsg := C.CString(fmt.Sprintf(format, args...)) 67 cmsg := C.CString(fmt.Sprintf(format, args...))
66 defer C.free(unsafe.Pointer(cmsg)) 68 defer C.free(unsafe.Pointer(cmsg))
67 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) 69 C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
68 } 70 }
69 71
70 // Spawn a child process via fork, create new process group, chdir and 72 type uptimeResult struct {
71 // redirect std in and out to /dev/null. 73 » Uptime time.Duration
72 func daemonize() (int, error) { 74 » Err error
73 » ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
74 » pid := int(ret)
75 » if errno != 0 {
76 » » return 0, errno
77 » }
78 » if pid > 0 {
79 » » return pid, nil
80 » }
81
82 » _, err := syscall.Setsid()
83 » if err != nil {
84 » » return 0, err
85 » }
86
87 » f, err := os.Open("/dev/null")
88 » if err != nil {
89 » » return 0, err
90 » }
91 » fd := f.Fd()
92 » syscall.Dup2(int(fd), stdInFd)
93 » syscall.Dup2(int(fd), stdOutFd)
94 » syscall.Dup2(int(fd), stdErrFd)
95
96 » return pid, nil
97 } 75 }
98 76
99 // Read from /proc/uptime. Expected format: 77 // Read from /proc/uptime. Expected format:
100 // "uptime_in_seconds cpu_idle_time_in_seconds" 78 // "uptime_in_seconds cpu_idle_time_in_seconds"
101 func getDeviceUptime() (time.Duration, error) { 79 // Return the uptime via a channel for use with timeouts.
80 func readUptime() (time.Duration, error) {
102 bytes, err := ioutil.ReadFile("/proc/uptime") 81 bytes, err := ioutil.ReadFile("/proc/uptime")
103 if err != nil { 82 if err != nil {
104 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) 83 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error())
105 } 84 }
106 // Split on the space to get uptime and drop cpu idle time. 85 // Split on the space to get uptime and drop cpu idle time.
107 uptimeFields := strings.Fields(string(bytes)) 86 uptimeFields := strings.Fields(string(bytes))
108 if len(uptimeFields) == 0 { 87 if len(uptimeFields) == 0 {
109 return 0, fmt.Errorf("unable to parse /proc/uptime") 88 return 0, fmt.Errorf("unable to parse /proc/uptime")
110 } 89 }
111 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) 90 uptime, err := strconv.ParseFloat(uptimeFields[0], 64)
112 if err != nil { 91 if err != nil {
113 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) 92 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error())
114 } 93 }
115 return time.Duration(uptime * float64(time.Second)), nil 94 return time.Duration(uptime * float64(time.Second)), nil
116 } 95 }
117 96
97 func getUptime(requestQueue chan<- chan<- uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) {
98 request := make(chan uptimeResult, 1)
99 defer close(request)
100
101 timer := time.NewTimer(timeoutPeriod)
102 defer timer.Stop()
103
104 select {
105 case requestQueue <- request:
106 break
107 case <-timer.C:
108 return 0, errTimeout
109 }
110
111 select {
112 case resp := <-request:
113 return resp.Uptime, resp.Err
114 case <-timer.C:
115 return 0, errTimeout
116 }
117 }
118
118 // Reboot device by writing to sysrq-trigger. See: 119 // Reboot device by writing to sysrq-trigger. See:
119 // https://www.kernel.org/doc/Documentation/sysrq.txt 120 // https://www.kernel.org/doc/Documentation/sysrq.txt
120 func rebootDevice() error { 121 func rebootDevice() error {
121 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) 122 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0)
122 if err != nil { 123 if err != nil {
123 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) 124 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error())
124 } 125 }
125 defer fd.Close() 126 defer fd.Close()
126 _, err = fd.Write([]byte("b")) 127 _, err = fd.Write([]byte("b"))
127 if err != nil { 128 if err != nil {
128 return fmt.Errorf("Can't reboot: %s", err.Error()) 129 return fmt.Errorf("Can't reboot: %s", err.Error())
129 } 130 }
130 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") 131 return fmt.Errorf("I just rebooted. How am I still alive?!?\n")
131 } 132 }
132 133
133 func realMain() int { 134 func realMain() int {
135 godaemon.MakeDaemon(&godaemon.DaemonAttr{})
136
134 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") 137 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
135 flag.Parse() 138 flag.Parse()
136 139
137 » os.Chdir("/") 140 » requestQueue := make(chan chan<- uptimeResult)
138 » pid, err := daemonize() 141 » go func() {
139 » if err != nil { 142 » » for request := range requestQueue {
140 » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) 143 » » » uptime, err := readUptime()
141 » » return 1 144 » » » request <- uptimeResult{Uptime: uptime, Err: err}
142 » } 145 » » }
143 » if pid > 0 { 146 » }()
144 » » logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) 147 » defer close(requestQueue)
145 » » return 0
146 » }
147 148
148 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute 149 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute
150 consecutiveTimeouts := 0
151 const maxTimeouts = 5
149 for { 152 for {
150 » » uptime, err := getDeviceUptime() 153 » » uptime, err := getUptime(requestQueue, 5*time.Second)
151 » » if err != nil { 154 » » switch err {
155 » » case nil:
156 » » » consecutiveTimeouts = 0
157 » » case errTimeout:
158 » » » consecutiveTimeouts++
159 » » default:
152 logcatLog(logError, "Failed to get uptime: %s", err.Error()) 160 logcatLog(logError, "Failed to get uptime: %s", err.Error())
153 return 1 161 return 1
154 } 162 }
163 if consecutiveTimeouts >= maxTimeouts {
164 logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", consecutiveTimeouts)
165 break
166 }
167 if consecutiveTimeouts > 0 {
168 logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.")
169 time.Sleep(60 * time.Second)
170 continue
171 }
155 172
156 if uptime > maxUptime { 173 if uptime > maxUptime {
157 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) 174 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime)
158 break 175 break
159 } 176 }
160 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) 177 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime)
178 // Add an additional second to the sleep to ensure it doesn't
179 // sleep several times in less than a second.
161 time.Sleep(maxUptime - uptime + time.Second) 180 time.Sleep(maxUptime - uptime + time.Second)
162 } 181 }
163 » if err = rebootDevice(); err != nil { 182 » if err := rebootDevice(); err != nil {
164 logcatLog(logError, "Failed to reboot device: %s", err.Error()) 183 logcatLog(logError, "Failed to reboot device: %s", err.Error())
165 return 1 184 return 1
166 } 185 }
167 return 0 186 return 0
168 } 187 }
169 188
170 func main() { 189 func main() {
171 paniccatcher.Do(func() { 190 paniccatcher.Do(func() {
172 os.Exit(realMain()) 191 os.Exit(realMain())
173 }, func(p *paniccatcher.Panic) { 192 }, func(p *paniccatcher.Panic) {
174 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) 193 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack)
175 os.Exit(1) 194 os.Exit(1)
176 }) 195 })
177 } 196 }
OLDNEW
« no previous file with comments | « go/deps.yaml ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698