Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // +build android | 5 // +build android |
| 6 | 6 |
| 7 // Watchdog daemon for android devices. It will attempt to reboot the device | 7 // Watchdog daemon for android devices. It will attempt to reboot the device |
| 8 // if its uptime exceeds a specified maximum. | 8 // if its uptime exceeds a specified maximum. |
| 9 package main | 9 package main |
| 10 | 10 |
| 11 /* | 11 /* |
| 12 #cgo LDFLAGS: -landroid -llog | 12 #cgo LDFLAGS: -landroid -llog |
| 13 | 13 |
| 14 #include <android/log.h> | 14 #include <android/log.h> |
| 15 #include <string.h> | 15 #include <string.h> |
| 16 */ | 16 */ |
| 17 import "C" | 17 import "C" |
| 18 | 18 |
| 19 import ( | 19 import ( |
| 20 "flag" | 20 "flag" |
| 21 "fmt" | 21 "fmt" |
| 22 "io/ioutil" | 22 "io/ioutil" |
| 23 "os" | 23 "os" |
| 24 "os/exec" | |
| 24 "strconv" | 25 "strconv" |
| 25 "strings" | 26 "strings" |
| 26 "syscall" | |
| 27 "time" | 27 "time" |
| 28 "unsafe" | 28 "unsafe" |
| 29 | 29 |
| 30 "github.com/luci/luci-go/common/runtime/paniccatcher" | 30 "github.com/luci/luci-go/common/runtime/paniccatcher" |
| 31 ) | 31 ) |
| 32 | 32 |
| 33 var ( | 33 var ( |
| 34 logHeader = C.CString("CIT_DeviceWatchdog") | 34 logHeader = C.CString("CIT_DeviceWatchdog") |
| 35 ) | 35 ) |
| 36 | 36 |
| 37 type logLevel int | 37 type logLevel int |
| 38 | 38 |
| 39 const ( | 39 const ( |
| 40 logInfo = iota | 40 logInfo = iota |
| 41 logWarning | 41 logWarning |
| 42 logError | 42 logError |
| 43 ) | 43 ) |
| 44 | 44 |
| 45 const ( | |
| 46 stdInFd = 0 | |
| 47 stdOutFd = 1 | |
| 48 stdErrFd = 2 | |
| 49 ) | |
| 50 | |
| 51 func (l logLevel) getLogLevel() C.int { | 45 func (l logLevel) getLogLevel() C.int { |
| 52 switch l { | 46 switch l { |
| 53 case logInfo: | 47 case logInfo: |
| 54 return C.ANDROID_LOG_INFO | 48 return C.ANDROID_LOG_INFO |
| 55 case logWarning: | 49 case logWarning: |
| 56 return C.ANDROID_LOG_WARN | 50 return C.ANDROID_LOG_WARN |
| 57 case logError: | 51 case logError: |
| 58 return C.ANDROID_LOG_ERROR | 52 return C.ANDROID_LOG_ERROR |
| 59 default: | 53 default: |
| 60 panic("Unknown log level.") | 54 panic("Unknown log level.") |
| 61 } | 55 } |
| 62 } | 56 } |
| 63 | 57 |
| 64 func logcatLog(level logLevel, format string, args ...interface{}) { | 58 func logcatLog(level logLevel, format string, args ...interface{}) { |
| 65 cmsg := C.CString(fmt.Sprintf(format, args...)) | 59 cmsg := C.CString(fmt.Sprintf(format, args...)) |
| 66 defer C.free(unsafe.Pointer(cmsg)) | 60 defer C.free(unsafe.Pointer(cmsg)) |
| 67 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) | 61 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) |
| 68 } | 62 } |
| 69 | 63 |
| 70 // Spawn a child process via fork, create new process group, chdir and | 64 // Spawn a child process via exec. |
| 71 // redirect std in and out to /dev/null. | 65 func daemonize(maxUptime int) (int, error) { |
|
dnj
2016/09/02 16:17:32
This isn't a full daemonize workflow. It doesn't s
bpastene
2016/09/02 23:02:13
There were problems with threads not correctly get
| |
| 72 func daemonize() (int, error) { | 66 » binary, err := os.Readlink("/proc/self/exe") |
| 73 » ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) | |
| 74 » pid := int(ret) | |
| 75 » if errno != 0 { | |
| 76 » » return 0, errno | |
| 77 » } | |
| 78 » if pid > 0 { | |
| 79 » » return pid, nil | |
| 80 » } | |
| 81 | |
| 82 » _, err := syscall.Setsid() | |
| 83 if err != nil { | 67 if err != nil { |
| 84 return 0, err | 68 return 0, err |
| 85 } | 69 } |
| 86 | 70 » cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize") |
| 87 » f, err := os.Open("/dev/null") | 71 » err = cmd.Start() |
| 88 if err != nil { | 72 if err != nil { |
| 89 return 0, err | 73 return 0, err |
| 90 } | 74 } |
| 91 » fd := f.Fd() | 75 » return cmd.Process.Pid, nil |
| 92 » syscall.Dup2(int(fd), stdInFd) | 76 } |
| 93 » syscall.Dup2(int(fd), stdOutFd) | |
| 94 » syscall.Dup2(int(fd), stdErrFd) | |
| 95 | 77 |
| 96 » return pid, nil | 78 type uptimeResult struct { |
| 79 » Uptime time.Duration | |
| 80 » Err error | |
| 97 } | 81 } |
| 98 | 82 |
| 99 // Read from /proc/uptime. Expected format: | 83 // Read from /proc/uptime. Expected format: |
| 100 // "uptime_in_seconds cpu_idle_time_in_seconds" | 84 // "uptime_in_seconds cpu_idle_time_in_seconds" |
| 101 func getDeviceUptime() (time.Duration, error) { | 85 // Return the uptime via a channel for use with timeouts. |
| 86 func getDeviceUptime(c chan uptimeResult) { | |
|
dnj
2016/09/02 16:17:32
Make this a directional channel:
c chan<- uptimeResult
bpastene
2016/09/02 23:02:13
Done.
| |
| 102 bytes, err := ioutil.ReadFile("/proc/uptime") | 87 bytes, err := ioutil.ReadFile("/proc/uptime") |
| 103 if err != nil { | 88 if err != nil { |
| 104 » » return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) | 89 » » c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to open /proc/uptime: %s", err.Error())} |
| 90 » » return | |
| 105 } | 91 } |
| 106 // Split on the space to get uptime and drop cpu idle time. | 92 // Split on the space to get uptime and drop cpu idle time. |
| 107 uptimeFields := strings.Fields(string(bytes)) | 93 uptimeFields := strings.Fields(string(bytes)) |
| 108 if len(uptimeFields) == 0 { | 94 if len(uptimeFields) == 0 { |
| 109 » » return 0, fmt.Errorf("unable to parse /proc/uptime") | 95 » » c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse /proc/uptime")} |
| 96 » » return | |
| 110 } | 97 } |
| 111 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) | 98 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) |
| 112 if err != nil { | 99 if err != nil { |
| 113 » » return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) | 100 » » c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse uptime: %s", err.Error())} |
| 101 » » return | |
| 114 } | 102 } |
| 115 » return time.Duration(uptime * float64(time.Second)), nil | 103 » c <- uptimeResult{Uptime: time.Duration(uptime * float64(time.Second)), Err: nil} |
| 116 } | 104 } |
| 117 | 105 |
| 118 // Reboot device by writing to sysrq-trigger. See: | 106 // Reboot device by writing to sysrq-trigger. See: |
| 119 // https://www.kernel.org/doc/Documentation/sysrq.txt | 107 // https://www.kernel.org/doc/Documentation/sysrq.txt |
| 120 func rebootDevice() error { | 108 func rebootDevice() error { |
| 121 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) | 109 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) |
| 122 if err != nil { | 110 if err != nil { |
| 123 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) | 111 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) |
| 124 } | 112 } |
| 125 defer fd.Close() | 113 defer fd.Close() |
| 126 _, err = fd.Write([]byte("b")) | 114 _, err = fd.Write([]byte("b")) |
| 127 if err != nil { | 115 if err != nil { |
| 128 return fmt.Errorf("Can't reboot: %s", err.Error()) | 116 return fmt.Errorf("Can't reboot: %s", err.Error()) |
| 129 } | 117 } |
| 130 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") | 118 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") |
| 131 } | 119 } |
| 132 | 120 |
| 133 func realMain() int { | 121 func realMain() int { |
| 134 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") | 122 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") |
| 123 skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.") | |
| 135 flag.Parse() | 124 flag.Parse() |
| 136 | 125 |
| 137 » os.Chdir("/") | 126 » if !*skipDaemonizeFlag { |
| 138 » pid, err := daemonize() | 127 » » pid, err := daemonize(*maxUptimeFlag) |
| 139 » if err != nil { | 128 » » if err != nil { |
| 140 » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) | 129 » » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) |
| 141 » » return 1 | 130 » » » return 1 |
| 142 » } | 131 » » } |
| 143 » if pid > 0 { | |
| 144 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) | 132 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) |
| 145 return 0 | 133 return 0 |
| 146 } | 134 } |
| 147 | 135 |
| 136 var uptimeRes uptimeResult | |
| 148 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute | 137 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute |
| 138 consecutiveTimeouts := 0 | |
| 139 maxTimeouts := 5 | |
| 149 for { | 140 for { |
| 150 » » uptime, err := getDeviceUptime() | 141 » » c := make(chan uptimeResult, 1) |
| 151 » » if err != nil { | 142 » » go getDeviceUptime(c) |
|
bpastene
2016/09/02 00:05:22
Uses a go routine + select statement to add a time
dnj
2016/09/02 16:17:32
Select is correct, but you launch a new goroutine
bpastene
2016/09/02 23:02:13
Thanks for the suggestion Dan. I added the single
| |
| 152 » » » logcatLog(logError, "Failed to get uptime: %s", err.Error()) | 143 » » select { |
| 144 » » case uptimeRes = <-c: | |
| 145 » » » consecutiveTimeouts = 0 | |
| 146 » » case <-time.After(5 * time.Second): | |
| 147 » » » consecutiveTimeouts++ | |
| 148 » » } | |
| 149 » » if consecutiveTimeouts == maxTimeouts { | |
| 150 » » » logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts) | |
| 151 » » » break | |
| 152 » » } else if consecutiveTimeouts > 0 { | |
| 153 » » » logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.") | |
| 154 » » » time.Sleep(60 * time.Second) | |
| 155 » » » continue | |
| 156 » » } | |
| 157 » » if uptimeRes.Err != nil { | |
| 158 » » » logcatLog(logError, "Failed to get uptime: %s", uptimeRes.Err.Error()) | |
| 153 return 1 | 159 return 1 |
| 154 } | 160 } |
| 155 | 161 |
| 156 » » if uptime > maxUptime { | 162 » » if uptimeRes.Uptime > maxUptime { |
| 157 » » » logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) | 163 » » » logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptimeRes.Uptime, maxUptime) |
| 158 break | 164 break |
| 159 } | 165 } |
| 160 » » logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) | 166 » » logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptimeRes.Uptime, maxUptime) |
| 161 » » time.Sleep(maxUptime - uptime + time.Second) | 167 » » time.Sleep(maxUptime - uptimeRes.Uptime + time.Second) |
|
dnj
2016/09/02 16:17:32
(Before the world forgets, add a comment about why
bpastene
2016/09/02 23:02:13
Done.
| |
| 162 } | 168 } |
| 163 » if err = rebootDevice(); err != nil { | 169 » if err := rebootDevice(); err != nil { |
| 164 logcatLog(logError, "Failed to reboot device: %s", err.Error()) | 170 logcatLog(logError, "Failed to reboot device: %s", err.Error()) |
| 165 return 1 | 171 return 1 |
| 166 } | 172 } |
| 167 return 0 | 173 return 0 |
| 168 } | 174 } |
| 169 | 175 |
| 170 func main() { | 176 func main() { |
| 171 paniccatcher.Do(func() { | 177 paniccatcher.Do(func() { |
| 172 os.Exit(realMain()) | 178 os.Exit(realMain()) |
| 173 }, func(p *paniccatcher.Panic) { | 179 }, func(p *paniccatcher.Panic) { |
| 174 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) | 180 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) |
| 175 os.Exit(1) | 181 os.Exit(1) |
| 176 }) | 182 }) |
| 177 } | 183 } |
| OLD | NEW |