Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // +build android | 5 // +build android |
| 6 | 6 |
| 7 // Watchdog daemon for android devices. It will attempt to reboot the device | 7 // Watchdog daemon for android devices. It will attempt to reboot the device |
| 8 // if its uptime exceeds a specified maximum. | 8 // if its uptime exceeds a specified maximum. |
| 9 package main | 9 package main |
| 10 | 10 |
| 11 /* | 11 /* |
| 12 #cgo LDFLAGS: -landroid -llog | 12 #cgo LDFLAGS: -landroid -llog |
| 13 | 13 |
| 14 #include <android/log.h> | 14 #include <android/log.h> |
| 15 #include <string.h> | 15 #include <string.h> |
| 16 */ | 16 */ |
| 17 import "C" | 17 import "C" |
| 18 | 18 |
| 19 import ( | 19 import ( |
| 20 "errors" | |
| 20 "flag" | 21 "flag" |
| 21 "fmt" | 22 "fmt" |
| 22 "io/ioutil" | 23 "io/ioutil" |
| 23 "os" | 24 "os" |
| 25 "os/exec" | |
| 24 "strconv" | 26 "strconv" |
| 25 "strings" | 27 "strings" |
| 26 "syscall" | 28 "syscall" |
| 27 "time" | 29 "time" |
| 28 "unsafe" | 30 "unsafe" |
| 29 | 31 |
| 30 "github.com/luci/luci-go/common/runtime/paniccatcher" | 32 "github.com/luci/luci-go/common/runtime/paniccatcher" |
| 31 ) | 33 ) |
| 32 | 34 |
| 33 var ( | 35 var ( |
| 34 » logHeader = C.CString("CIT_DeviceWatchdog") | 36 » logHeader = C.CString("CIT_DeviceWatchdog") |
| 37 » errTimeout = errors.New("timeout") | |
| 38 ) | |
| 39 | |
| 40 const ( | |
| 41 » stdInFd = 0 | |
| 42 » stdOutFd = 1 | |
| 43 » stdErrFd = 2 | |
| 35 ) | 44 ) |
| 36 | 45 |
| 37 type logLevel int | 46 type logLevel int |
| 38 | 47 |
| 39 const ( | 48 const ( |
| 40 logInfo = iota | 49 logInfo = iota |
| 41 logWarning | 50 logWarning |
| 42 logError | 51 logError |
| 43 ) | 52 ) |
| 44 | 53 |
| 45 const ( | |
| 46 stdInFd = 0 | |
| 47 stdOutFd = 1 | |
| 48 stdErrFd = 2 | |
| 49 ) | |
| 50 | |
| 51 func (l logLevel) getLogLevel() C.int { | 54 func (l logLevel) getLogLevel() C.int { |
| 52 switch l { | 55 switch l { |
| 53 case logInfo: | 56 case logInfo: |
| 54 return C.ANDROID_LOG_INFO | 57 return C.ANDROID_LOG_INFO |
| 55 case logWarning: | 58 case logWarning: |
| 56 return C.ANDROID_LOG_WARN | 59 return C.ANDROID_LOG_WARN |
| 57 case logError: | 60 case logError: |
| 58 return C.ANDROID_LOG_ERROR | 61 return C.ANDROID_LOG_ERROR |
| 59 default: | 62 default: |
| 60 panic("Unknown log level.") | 63 panic("Unknown log level.") |
| 61 } | 64 } |
| 62 } | 65 } |
| 63 | 66 |
| 64 func logcatLog(level logLevel, format string, args ...interface{}) { | 67 func logcatLog(level logLevel, format string, args ...interface{}) { |
| 65 cmsg := C.CString(fmt.Sprintf(format, args...)) | 68 cmsg := C.CString(fmt.Sprintf(format, args...)) |
| 66 defer C.free(unsafe.Pointer(cmsg)) | 69 defer C.free(unsafe.Pointer(cmsg)) |
| 67 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) | 70 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) |
| 68 } | 71 } |
| 69 | 72 |
| 70 // Spawn a child process via fork, create new process group, chdir and | 73 // Spawn a child process via exec. |
| 71 // redirect std in and out to /dev/null. | 74 func daemonize(maxUptime int) (int, error) { |
|
dnj
2016/09/03 00:48:07
WDYT about having daemonize take an args slice ins
bpastene
2016/09/09 00:51:43
Done, but I'd rather replace argv[0] with /proc/se
| |
| 72 func daemonize() (int, error) { | |
| 73 ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) | 75 ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) |
|
dnj
2016/09/03 00:48:07
Forking here may be a little sketchy. In other dae
bpastene
2016/09/09 00:51:43
I was modeling it after https://chromium.googlesou
bpastene
2016/09/09 19:38:53
Nevermind. This is now all totally obsolete since
| |
| 74 pid := int(ret) | 76 pid := int(ret) |
| 75 if errno != 0 { | 77 if errno != 0 { |
| 76 return 0, errno | 78 return 0, errno |
| 77 } | 79 } |
| 78 if pid > 0 { | 80 if pid > 0 { |
| 79 return pid, nil | 81 return pid, nil |
| 80 } | 82 } |
| 81 | |
| 82 _, err := syscall.Setsid() | 83 _, err := syscall.Setsid() |
| 83 if err != nil { | 84 if err != nil { |
| 84 return 0, err | 85 return 0, err |
| 85 } | 86 } |
| 86 | |
| 87 f, err := os.Open("/dev/null") | 87 f, err := os.Open("/dev/null") |
| 88 if err != nil { | 88 if err != nil { |
| 89 return 0, err | 89 return 0, err |
| 90 } | 90 } |
| 91 fd := f.Fd() | 91 fd := f.Fd() |
| 92 syscall.Dup2(int(fd), stdInFd) | 92 syscall.Dup2(int(fd), stdInFd) |
| 93 syscall.Dup2(int(fd), stdOutFd) | 93 syscall.Dup2(int(fd), stdOutFd) |
| 94 syscall.Dup2(int(fd), stdErrFd) | 94 syscall.Dup2(int(fd), stdErrFd) |
| 95 | 95 |
| 96 » return pid, nil | 96 » binary, err := os.Readlink("/proc/self/exe") |
| 97 » if err != nil { | |
| 98 » » return 0, err | |
| 99 » } | |
| 100 » cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize") | |
| 101 » err = cmd.Start() | |
|
dnj
2016/09/03 00:48:07
You can avoid some junk by using "os.StartProcess"
bpastene
2016/09/09 00:51:43
I'm not sure what junk I'm avoiding, but done. (I
| |
| 102 » if err != nil { | |
| 103 » » return 0, err | |
| 104 » } | |
| 105 » return cmd.Process.Pid, nil | |
| 106 } | |
| 107 | |
| 108 type uptimeResult struct { | |
| 109 » Uptime time.Duration | |
| 110 » Err error | |
| 97 } | 111 } |
| 98 | 112 |
| 99 // Read from /proc/uptime. Expected format: | 113 // Read from /proc/uptime. Expected format: |
| 100 // "uptime_in_seconds cpu_idle_time_in_seconds" | 114 // "uptime_in_seconds cpu_idle_time_in_seconds" |
| 101 func getDeviceUptime() (time.Duration, error) { | 115 // Return the uptime via a channel for use with timeouts. |
| 116 func readUptime() (time.Duration, error) { | |
| 102 bytes, err := ioutil.ReadFile("/proc/uptime") | 117 bytes, err := ioutil.ReadFile("/proc/uptime") |
| 103 if err != nil { | 118 if err != nil { |
| 104 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) | 119 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) |
| 105 } | 120 } |
| 106 // Split on the space to get uptime and drop cpu idle time. | 121 // Split on the space to get uptime and drop cpu idle time. |
| 107 uptimeFields := strings.Fields(string(bytes)) | 122 uptimeFields := strings.Fields(string(bytes)) |
| 108 if len(uptimeFields) == 0 { | 123 if len(uptimeFields) == 0 { |
| 109 return 0, fmt.Errorf("unable to parse /proc/uptime") | 124 return 0, fmt.Errorf("unable to parse /proc/uptime") |
| 110 } | 125 } |
| 111 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) | 126 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) |
| 112 if err != nil { | 127 if err != nil { |
| 113 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) | 128 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) |
| 114 } | 129 } |
| 115 return time.Duration(uptime * float64(time.Second)), nil | 130 return time.Duration(uptime * float64(time.Second)), nil |
| 116 } | 131 } |
| 117 | 132 |
| 133 func getUptime(requestQueue chan<- chan uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) { | |
|
dnj
2016/09/03 00:48:07
nit: chan<- chan<- uptimeResult
In other words, a
bpastene
2016/09/09 00:51:43
Done, but aren't I reading from those channels lat
dnj
2016/09/09 19:40:57
Yeah a bidirectional channel can be converted into
| |
| 134 request := make(chan uptimeResult, 1) | |
| 135 defer close(request) | |
| 136 | |
| 137 timer := time.NewTimer(timeoutPeriod) | |
| 138 defer timer.Stop() | |
| 139 | |
| 140 select { | |
| 141 case requestQueue <- request: | |
| 142 break | |
| 143 case <-timer.C: | |
| 144 return 0, errTimeout | |
| 145 } | |
| 146 | |
| 147 select { | |
| 148 case resp := <-request: | |
| 149 return resp.Uptime, resp.Err | |
| 150 case <-timer.C: | |
| 151 return 0, errTimeout | |
| 152 } | |
| 153 } | |
| 154 | |
| 118 // Reboot device by writing to sysrq-trigger. See: | 155 // Reboot device by writing to sysrq-trigger. See: |
| 119 // https://www.kernel.org/doc/Documentation/sysrq.txt | 156 // https://www.kernel.org/doc/Documentation/sysrq.txt |
| 120 func rebootDevice() error { | 157 func rebootDevice() error { |
| 121 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) | 158 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) |
| 122 if err != nil { | 159 if err != nil { |
| 123 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) | 160 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) |
| 124 } | 161 } |
| 125 defer fd.Close() | 162 defer fd.Close() |
| 126 _, err = fd.Write([]byte("b")) | 163 _, err = fd.Write([]byte("b")) |
| 127 if err != nil { | 164 if err != nil { |
| 128 return fmt.Errorf("Can't reboot: %s", err.Error()) | 165 return fmt.Errorf("Can't reboot: %s", err.Error()) |
| 129 } | 166 } |
| 130 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") | 167 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") |
| 131 } | 168 } |
| 132 | 169 |
| 133 func realMain() int { | 170 func realMain() int { |
| 134 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") | 171 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") |
| 172 skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.") | |
| 135 flag.Parse() | 173 flag.Parse() |
| 136 | 174 |
| 137 » os.Chdir("/") | 175 » if !*skipDaemonizeFlag { |
| 138 » pid, err := daemonize() | 176 » » pid, err := daemonize(*maxUptimeFlag) |
| 139 » if err != nil { | 177 » » if err != nil { |
| 140 » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) | 178 » » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) |
| 141 » » return 1 | 179 » » » return 1 |
| 142 » } | 180 » » } |
| 143 » if pid > 0 { | |
| 144 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) | 181 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) |
| 145 return 0 | 182 return 0 |
| 146 } | 183 } |
| 147 | 184 |
| 185 requestQueue := make(chan chan uptimeResult) | |
| 186 go func() { | |
| 187 for request := range requestQueue { | |
| 188 uptime, err := readUptime() | |
| 189 request <- uptimeResult{Uptime: uptime, Err: err} | |
| 190 } | |
| 191 }() | |
| 192 defer close(requestQueue) | |
| 193 | |
| 148 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute | 194 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute |
| 195 consecutiveTimeouts := 0 | |
| 196 maxTimeouts := 5 | |
|
dnj
2016/09/03 00:48:07
nit: "const maxTimeouts = 5"
bpastene
2016/09/09 00:51:43
Done.
| |
| 149 for { | 197 for { |
| 150 » » uptime, err := getDeviceUptime() | 198 » » uptime, err := getUptime(requestQueue, 5*time.Second) |
| 151 » » if err != nil { | 199 » » switch err { |
| 200 » » case nil: | |
| 201 » » » consecutiveTimeouts = 0 | |
| 202 » » case errTimeout: | |
| 203 » » » consecutiveTimeouts++ | |
| 204 » » default: | |
| 152 logcatLog(logError, "Failed to get uptime: %s", err.Error()) | 205 logcatLog(logError, "Failed to get uptime: %s", err.Error()) |
| 153 return 1 | 206 return 1 |
| 154 } | 207 } |
| 208 if consecutiveTimeouts == maxTimeouts { | |
|
dnj
2016/09/03 00:48:07
nit: Just being paranoid, but might as well make t
bpastene
2016/09/09 00:51:43
Done.
| |
| 209 logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts) | |
|
dnj
2016/09/03 00:48:07
nit: currentTimeouts, not maxTimeouts.
bpastene
2016/09/09 00:51:43
Done.
| |
| 210 break | |
| 211 } else if consecutiveTimeouts > 0 { | |
|
dnj
2016/09/03 00:48:07
nit: No need for "else if", since the previous sta
bpastene
2016/09/09 00:51:43
Done.
| |
| 212 logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.") | |
| 213 time.Sleep(60 * time.Second) | |
| 214 continue | |
| 215 } | |
| 155 | 216 |
| 156 if uptime > maxUptime { | 217 if uptime > maxUptime { |
| 157 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) | 218 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) |
| 158 break | 219 break |
| 159 } | 220 } |
| 160 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) | 221 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) |
| 222 // Add an additional second to the sleep to ensure it doesn't | |
| 223 // sleep several times in less than a second. | |
| 161 time.Sleep(maxUptime - uptime + time.Second) | 224 time.Sleep(maxUptime - uptime + time.Second) |
| 162 } | 225 } |
| 163 » if err = rebootDevice(); err != nil { | 226 » if err := rebootDevice(); err != nil { |
| 164 logcatLog(logError, "Failed to reboot device: %s", err.Error()) | 227 logcatLog(logError, "Failed to reboot device: %s", err.Error()) |
| 165 return 1 | 228 return 1 |
| 166 } | 229 } |
| 167 return 0 | 230 return 0 |
| 168 } | 231 } |
| 169 | 232 |
| 170 func main() { | 233 func main() { |
| 171 paniccatcher.Do(func() { | 234 paniccatcher.Do(func() { |
| 172 os.Exit(realMain()) | 235 os.Exit(realMain()) |
| 173 }, func(p *paniccatcher.Panic) { | 236 }, func(p *paniccatcher.Panic) { |
| 174 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) | 237 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) |
| 175 os.Exit(1) | 238 os.Exit(1) |
| 176 }) | 239 }) |
| 177 } | 240 } |
| OLD | NEW |