| OLD | NEW |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // +build android | 5 // +build android |
| 6 | 6 |
| 7 // Watchdog daemon for android devices. It will attempt to reboot the device | 7 // Watchdog daemon for android devices. It will attempt to reboot the device |
| 8 // if its uptime exceeds a specified maximum. | 8 // if its uptime exceeds a specified maximum. |
| 9 package main | 9 package main |
| 10 | 10 |
| 11 /* | 11 /* |
| 12 #cgo LDFLAGS: -landroid -llog | 12 #cgo LDFLAGS: -landroid -llog |
| 13 | 13 |
| 14 #include <android/log.h> | 14 #include <android/log.h> |
| 15 #include <string.h> | 15 #include <string.h> |
| 16 */ | 16 */ |
| 17 import "C" | 17 import "C" |
| 18 | 18 |
| 19 import ( | 19 import ( |
| 20 "errors" |
| 20 "flag" | 21 "flag" |
| 21 "fmt" | 22 "fmt" |
| 22 "io/ioutil" | 23 "io/ioutil" |
| 23 "os" | 24 "os" |
| 24 "strconv" | 25 "strconv" |
| 25 "strings" | 26 "strings" |
| 26 "syscall" | |
| 27 "time" | 27 "time" |
| 28 "unsafe" | 28 "unsafe" |
| 29 | 29 |
| 30 "github.com/VividCortex/godaemon" |
| 30 "github.com/luci/luci-go/common/runtime/paniccatcher" | 31 "github.com/luci/luci-go/common/runtime/paniccatcher" |
| 31 ) | 32 ) |
| 32 | 33 |
| 33 var ( | 34 var ( |
| 34 » logHeader = C.CString("CIT_DeviceWatchdog") | 35 » logHeader = C.CString("CIT_DeviceWatchdog") |
| 36 » errTimeout = errors.New("timeout") |
| 37 ) |
| 38 |
| 39 const ( |
| 40 » stdInFd = 0 |
| 41 » stdOutFd = 1 |
| 42 » stdErrFd = 2 |
| 35 ) | 43 ) |
| 36 | 44 |
| 37 type logLevel int | 45 type logLevel int |
| 38 | 46 |
| 39 const ( | 47 const ( |
| 40 logInfo = iota | 48 logInfo = iota |
| 41 logWarning | 49 logWarning |
| 42 logError | 50 logError |
| 43 ) | 51 ) |
| 44 | 52 |
| 45 const ( | |
| 46 stdInFd = 0 | |
| 47 stdOutFd = 1 | |
| 48 stdErrFd = 2 | |
| 49 ) | |
| 50 | |
| 51 func (l logLevel) getLogLevel() C.int { | 53 func (l logLevel) getLogLevel() C.int { |
| 52 switch l { | 54 switch l { |
| 53 case logInfo: | 55 case logInfo: |
| 54 return C.ANDROID_LOG_INFO | 56 return C.ANDROID_LOG_INFO |
| 55 case logWarning: | 57 case logWarning: |
| 56 return C.ANDROID_LOG_WARN | 58 return C.ANDROID_LOG_WARN |
| 57 case logError: | 59 case logError: |
| 58 return C.ANDROID_LOG_ERROR | 60 return C.ANDROID_LOG_ERROR |
| 59 default: | 61 default: |
| 60 panic("Unknown log level.") | 62 panic("Unknown log level.") |
| 61 } | 63 } |
| 62 } | 64 } |
| 63 | 65 |
| 64 func logcatLog(level logLevel, format string, args ...interface{}) { | 66 func logcatLog(level logLevel, format string, args ...interface{}) { |
| 65 cmsg := C.CString(fmt.Sprintf(format, args...)) | 67 cmsg := C.CString(fmt.Sprintf(format, args...)) |
| 66 defer C.free(unsafe.Pointer(cmsg)) | 68 defer C.free(unsafe.Pointer(cmsg)) |
| 67 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) | 69 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) |
| 68 } | 70 } |
| 69 | 71 |
| 70 // Spawn a child process via fork, create new process group, chdir and | 72 type uptimeResult struct { |
| 71 // redirect std in and out to /dev/null. | 73 » Uptime time.Duration |
| 72 func daemonize() (int, error) { | 74 » Err error |
| 73 » ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) | |
| 74 » pid := int(ret) | |
| 75 » if errno != 0 { | |
| 76 » » return 0, errno | |
| 77 » } | |
| 78 » if pid > 0 { | |
| 79 » » return pid, nil | |
| 80 » } | |
| 81 | |
| 82 » _, err := syscall.Setsid() | |
| 83 » if err != nil { | |
| 84 » » return 0, err | |
| 85 » } | |
| 86 | |
| 87 » f, err := os.Open("/dev/null") | |
| 88 » if err != nil { | |
| 89 » » return 0, err | |
| 90 » } | |
| 91 » fd := f.Fd() | |
| 92 » syscall.Dup2(int(fd), stdInFd) | |
| 93 » syscall.Dup2(int(fd), stdOutFd) | |
| 94 » syscall.Dup2(int(fd), stdErrFd) | |
| 95 | |
| 96 » return pid, nil | |
| 97 } | 75 } |
| 98 | 76 |
| 99 // Read from /proc/uptime. Expected format: | 77 // Read from /proc/uptime. Expected format: |
| 100 // "uptime_in_seconds cpu_idle_time_in_seconds" | 78 // "uptime_in_seconds cpu_idle_time_in_seconds" |
| 101 func getDeviceUptime() (time.Duration, error) { | 79 // The uptime is returned directly; getUptime wraps this call with a timeout. |
| 80 func readUptime() (time.Duration, error) { |
| 102 bytes, err := ioutil.ReadFile("/proc/uptime") | 81 bytes, err := ioutil.ReadFile("/proc/uptime") |
| 103 if err != nil { | 82 if err != nil { |
| 104 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) | 83 return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) |
| 105 } | 84 } |
| 106 // Split on the space to get uptime and drop cpu idle time. | 85 // Split on the space to get uptime and drop cpu idle time. |
| 107 uptimeFields := strings.Fields(string(bytes)) | 86 uptimeFields := strings.Fields(string(bytes)) |
| 108 if len(uptimeFields) == 0 { | 87 if len(uptimeFields) == 0 { |
| 109 return 0, fmt.Errorf("unable to parse /proc/uptime") | 88 return 0, fmt.Errorf("unable to parse /proc/uptime") |
| 110 } | 89 } |
| 111 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) | 90 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) |
| 112 if err != nil { | 91 if err != nil { |
| 113 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) | 92 return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) |
| 114 } | 93 } |
| 115 return time.Duration(uptime * float64(time.Second)), nil | 94 return time.Duration(uptime * float64(time.Second)), nil |
| 116 } | 95 } |
| 117 | 96 |
| 97 func getUptime(requestQueue chan<- chan<- uptimeResult, timeoutPeriod time.Duration) (time.Duration, error) { |
| 98 request := make(chan uptimeResult, 1) |
| 99 defer close(request) |
| 100 |
| 101 timer := time.NewTimer(timeoutPeriod) |
| 102 defer timer.Stop() |
| 103 |
| 104 select { |
| 105 case requestQueue <- request: |
| 106 break |
| 107 case <-timer.C: |
| 108 return 0, errTimeout |
| 109 } |
| 110 |
| 111 select { |
| 112 case resp := <-request: |
| 113 return resp.Uptime, resp.Err |
| 114 case <-timer.C: |
| 115 return 0, errTimeout |
| 116 } |
| 117 } |
| 118 |
| 118 // Reboot device by writing to sysrq-trigger. See: | 119 // Reboot device by writing to sysrq-trigger. See: |
| 119 // https://www.kernel.org/doc/Documentation/sysrq.txt | 120 // https://www.kernel.org/doc/Documentation/sysrq.txt |
| 120 func rebootDevice() error { | 121 func rebootDevice() error { |
| 121 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) | 122 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) |
| 122 if err != nil { | 123 if err != nil { |
| 123 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) | 124 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) |
| 124 } | 125 } |
| 125 defer fd.Close() | 126 defer fd.Close() |
| 126 _, err = fd.Write([]byte("b")) | 127 _, err = fd.Write([]byte("b")) |
| 127 if err != nil { | 128 if err != nil { |
| 128 return fmt.Errorf("Can't reboot: %s", err.Error()) | 129 return fmt.Errorf("Can't reboot: %s", err.Error()) |
| 129 } | 130 } |
| 130 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") | 131 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") |
| 131 } | 132 } |
| 132 | 133 |
| 133 func realMain() int { | 134 func realMain() int { |
| 135 godaemon.MakeDaemon(&godaemon.DaemonAttr{}) |
| 136 |
| 134 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") | 137 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") |
| 135 flag.Parse() | 138 flag.Parse() |
| 136 | 139 |
| 137 » os.Chdir("/") | 140 » requestQueue := make(chan chan<- uptimeResult) |
| 138 » pid, err := daemonize() | 141 » go func() { |
| 139 » if err != nil { | 142 » » for request := range requestQueue { |
| 140 » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) | 143 » » » uptime, err := readUptime() |
| 141 » » return 1 | 144 » » » request <- uptimeResult{Uptime: uptime, Err: err} |
| 142 » } | 145 » » } |
| 143 » if pid > 0 { | 146 » }() |
| 144 » » logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) | 147 » defer close(requestQueue) |
| 145 » » return 0 | |
| 146 » } | |
| 147 | 148 |
| 148 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute | 149 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute |
| 150 consecutiveTimeouts := 0 |
| 151 const maxTimeouts = 5 |
| 149 for { | 152 for { |
| 150 » » uptime, err := getDeviceUptime() | 153 » » uptime, err := getUptime(requestQueue, 5*time.Second) |
| 151 » » if err != nil { | 154 » » switch err { |
| 155 » » case nil: |
| 156 » » » consecutiveTimeouts = 0 |
| 157 » » case errTimeout: |
| 158 » » » consecutiveTimeouts++ |
| 159 » » default: |
| 152 logcatLog(logError, "Failed to get uptime: %s", err.Error()) | 160 logcatLog(logError, "Failed to get uptime: %s", err.Error()) |
| 153 return 1 | 161 return 1 |
| 154 } | 162 } |
| 163 if consecutiveTimeouts >= maxTimeouts { |
| 164 logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", consecutiveTimeouts) |
| 165 break |
| 166 } |
| 167 if consecutiveTimeouts > 0 { |
| 168 logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.") |
| 169 time.Sleep(60 * time.Second) |
| 170 continue |
| 171 } |
| 155 | 172 |
| 156 if uptime > maxUptime { | 173 if uptime > maxUptime { |
| 157 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) | 174 logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) |
| 158 break | 175 break |
| 159 } | 176 } |
| 160 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) | 177 logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) |
| 178 // Add an additional second to the sleep to ensure it doesn't |
| 179 // sleep several times in less than a second. |
| 161 time.Sleep(maxUptime - uptime + time.Second) | 180 time.Sleep(maxUptime - uptime + time.Second) |
| 162 } | 181 } |
| 163 » if err = rebootDevice(); err != nil { | 182 » if err := rebootDevice(); err != nil { |
| 164 logcatLog(logError, "Failed to reboot device: %s", err.Error()) | 183 logcatLog(logError, "Failed to reboot device: %s", err.Error()) |
| 165 return 1 | 184 return 1 |
| 166 } | 185 } |
| 167 return 0 | 186 return 0 |
| 168 } | 187 } |
| 169 | 188 |
| 170 func main() { | 189 func main() { |
| 171 paniccatcher.Do(func() { | 190 paniccatcher.Do(func() { |
| 172 os.Exit(realMain()) | 191 os.Exit(realMain()) |
| 173 }, func(p *paniccatcher.Panic) { | 192 }, func(p *paniccatcher.Panic) { |
| 174 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) | 193 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) |
| 175 os.Exit(1) | 194 os.Exit(1) |
| 176 }) | 195 }) |
| 177 } | 196 } |
| OLD | NEW |