Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(893)

Side by Side Diff: go/src/infra/tools/device_watchdog/main.go

Issue 2302193002: Change daemonize logic in watchdog and add timeout to file system read. (Closed)
Patch Set: Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // +build android 5 // +build android
6 6
7 // Watchdog daemon for android devices. It will attempt to reboot the device 7 // Watchdog daemon for android devices. It will attempt to reboot the device
8 // if its uptime exceeds a specified maximum. 8 // if its uptime exceeds a specified maximum.
9 package main 9 package main
10 10
11 /* 11 /*
12 #cgo LDFLAGS: -landroid -llog 12 #cgo LDFLAGS: -landroid -llog
13 13
14 #include <android/log.h> 14 #include <android/log.h>
15 #include <string.h> 15 #include <string.h>
16 */ 16 */
17 import "C" 17 import "C"
18 18
19 import ( 19 import (
20 "flag" 20 "flag"
21 "fmt" 21 "fmt"
22 "io/ioutil" 22 "io/ioutil"
23 "os" 23 "os"
24 "os/exec"
24 "strconv" 25 "strconv"
25 "strings" 26 "strings"
26 "syscall"
27 "time" 27 "time"
28 "unsafe" 28 "unsafe"
29 29
30 "github.com/luci/luci-go/common/runtime/paniccatcher" 30 "github.com/luci/luci-go/common/runtime/paniccatcher"
31 ) 31 )
32 32
33 var ( 33 var (
34 logHeader = C.CString("CIT_DeviceWatchdog") 34 logHeader = C.CString("CIT_DeviceWatchdog")
35 ) 35 )
36 36
37 type logLevel int 37 type logLevel int
38 38
39 const ( 39 const (
40 logInfo = iota 40 logInfo = iota
41 logWarning 41 logWarning
42 logError 42 logError
43 ) 43 )
44 44
45 const (
46 stdInFd = 0
47 stdOutFd = 1
48 stdErrFd = 2
49 )
50
51 func (l logLevel) getLogLevel() C.int { 45 func (l logLevel) getLogLevel() C.int {
52 switch l { 46 switch l {
53 case logInfo: 47 case logInfo:
54 return C.ANDROID_LOG_INFO 48 return C.ANDROID_LOG_INFO
55 case logWarning: 49 case logWarning:
56 return C.ANDROID_LOG_WARN 50 return C.ANDROID_LOG_WARN
57 case logError: 51 case logError:
58 return C.ANDROID_LOG_ERROR 52 return C.ANDROID_LOG_ERROR
59 default: 53 default:
60 panic("Unknown log level.") 54 panic("Unknown log level.")
61 } 55 }
62 } 56 }
63 57
64 func logcatLog(level logLevel, format string, args ...interface{}) { 58 func logcatLog(level logLevel, format string, args ...interface{}) {
65 cmsg := C.CString(fmt.Sprintf(format, args...)) 59 cmsg := C.CString(fmt.Sprintf(format, args...))
66 defer C.free(unsafe.Pointer(cmsg)) 60 defer C.free(unsafe.Pointer(cmsg))
67 C.__android_log_write(level.getLogLevel(), logHeader, cmsg) 61 C.__android_log_write(level.getLogLevel(), logHeader, cmsg)
68 } 62 }
69 63
70 // Spawn a child process via fork, create new process group, chdir and 64 // Spawn a child process via exec.
71 // redirect std in and out to /dev/null. 65 func daemonize(maxUptime int) (int, error) {
dnj 2016/09/02 16:17:32 This isn't a full daemonize workflow. It doesn't s
bpastene 2016/09/02 23:02:13 There were problems with threads not correctly get
72 func daemonize() (int, error) { 66 » binary, err := os.Readlink("/proc/self/exe")
73 » ret, _, errno := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
74 » pid := int(ret)
75 » if errno != 0 {
76 » » return 0, errno
77 » }
78 » if pid > 0 {
79 » » return pid, nil
80 » }
81
82 » _, err := syscall.Setsid()
83 if err != nil { 67 if err != nil {
84 return 0, err 68 return 0, err
85 } 69 }
86 70 » cmd := exec.Command(binary, "--max-uptime", strconv.Itoa(maxUptime), "--skip-daemonize")
87 » f, err := os.Open("/dev/null") 71 » err = cmd.Start()
88 if err != nil { 72 if err != nil {
89 return 0, err 73 return 0, err
90 } 74 }
91 » fd := f.Fd() 75 » return cmd.Process.Pid, nil
92 » syscall.Dup2(int(fd), stdInFd) 76 }
93 » syscall.Dup2(int(fd), stdOutFd)
94 » syscall.Dup2(int(fd), stdErrFd)
95 77
96 » return pid, nil 78 type uptimeResult struct {
79 » Uptime time.Duration
80 » Err error
97 } 81 }
98 82
99 // Read from /proc/uptime. Expected format: 83 // Read from /proc/uptime. Expected format:
100 // "uptime_in_seconds cpu_idle_time_in_seconds" 84 // "uptime_in_seconds cpu_idle_time_in_seconds"
101 func getDeviceUptime() (time.Duration, error) { 85 // Return the uptime via a channel for use with timeouts.
86 func getDeviceUptime(c chan uptimeResult) {
dnj 2016/09/02 16:17:32 Make this a directional channel: c chan<- uptimeRe
bpastene 2016/09/02 23:02:13 Done.
102 bytes, err := ioutil.ReadFile("/proc/uptime") 87 bytes, err := ioutil.ReadFile("/proc/uptime")
103 if err != nil { 88 if err != nil {
104 » » return 0, fmt.Errorf("unable to open /proc/uptime: %s", err.Error()) 89 » » c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to open /proc/uptime: %s", err.Error())}
90 » » return
105 } 91 }
106 // Split on the space to get uptime and drop cpu idle time. 92 // Split on the space to get uptime and drop cpu idle time.
107 uptimeFields := strings.Fields(string(bytes)) 93 uptimeFields := strings.Fields(string(bytes))
108 if len(uptimeFields) == 0 { 94 if len(uptimeFields) == 0 {
109 » » return 0, fmt.Errorf("unable to parse /proc/uptime") 95 » » c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse /proc/uptime")}
96 » » return
110 } 97 }
111 uptime, err := strconv.ParseFloat(uptimeFields[0], 64) 98 uptime, err := strconv.ParseFloat(uptimeFields[0], 64)
112 if err != nil { 99 if err != nil {
113 » » return 0, fmt.Errorf("unable to parse uptime: %s", err.Error()) 100 » » c <- uptimeResult{Uptime: 0, Err: fmt.Errorf("unable to parse uptime: %s", err.Error())}
101 » » return
114 } 102 }
115 » return time.Duration(uptime * float64(time.Second)), nil 103 » c <- uptimeResult{Uptime: time.Duration(uptime * float64(time.Second)), Err: nil}
116 } 104 }
117 105
118 // Reboot device by writing to sysrq-trigger. See: 106 // Reboot device by writing to sysrq-trigger. See:
119 // https://www.kernel.org/doc/Documentation/sysrq.txt 107 // https://www.kernel.org/doc/Documentation/sysrq.txt
120 func rebootDevice() error { 108 func rebootDevice() error {
121 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0) 109 fd, err := os.OpenFile("/proc/sysrq-trigger", os.O_WRONLY, 0)
122 if err != nil { 110 if err != nil {
123 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error()) 111 return fmt.Errorf("Can't open /proc/sysrq-trigger: %s", err.Error())
124 } 112 }
125 defer fd.Close() 113 defer fd.Close()
126 _, err = fd.Write([]byte("b")) 114 _, err = fd.Write([]byte("b"))
127 if err != nil { 115 if err != nil {
128 return fmt.Errorf("Can't reboot: %s", err.Error()) 116 return fmt.Errorf("Can't reboot: %s", err.Error())
129 } 117 }
130 return fmt.Errorf("I just rebooted. How am I still alive?!?\n") 118 return fmt.Errorf("I just rebooted. How am I still alive?!?\n")
131 } 119 }
132 120
133 func realMain() int { 121 func realMain() int {
134 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.") 122 maxUptimeFlag := flag.Int("max-uptime", 120, "Maximum uptime in minutes before a reboot is triggered.")
123 skipDaemonizeFlag := flag.Bool("skip-daemonize", false, "Skips the daemonize logic. Otherwise it will spawn a copy daemon and exit.")
135 flag.Parse() 124 flag.Parse()
136 125
137 » os.Chdir("/") 126 » if !*skipDaemonizeFlag {
138 » pid, err := daemonize() 127 » » pid, err := daemonize(*maxUptimeFlag)
139 » if err != nil { 128 » » if err != nil {
140 » » logcatLog(logError, "Failed to daemonize: %s", err.Error()) 129 » » » logcatLog(logError, "Failed to daemonize: %s", err.Error())
141 » » return 1 130 » » » return 1
142 » } 131 » » }
143 » if pid > 0 {
144 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid) 132 logcatLog(logInfo, "Child spawned with pid %d, exiting parent\n", pid)
145 return 0 133 return 0
146 } 134 }
147 135
136 var uptimeRes uptimeResult
148 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute 137 maxUptime := time.Duration(*maxUptimeFlag) * time.Minute
138 consecutiveTimeouts := 0
139 maxTimeouts := 5
149 for { 140 for {
150 » » uptime, err := getDeviceUptime() 141 » » c := make(chan uptimeResult, 1)
151 » » if err != nil { 142 » » go getDeviceUptime(c)
bpastene 2016/09/02 00:05:22 Uses a go routine + select statement to add a time
dnj 2016/09/02 16:17:32 Select is correct, but you launch a new goroutine
bpastene 2016/09/02 23:02:13 Thanks for the suggestion Dan. I added the single
152 » » » logcatLog(logError, "Failed to get uptime: %s", err.Error()) 143 » » select {
144 » » case uptimeRes = <-c:
145 » » » consecutiveTimeouts = 0
146 » » case <-time.After(5 * time.Second):
147 » » » consecutiveTimeouts++
148 » » }
149 » » if consecutiveTimeouts == maxTimeouts {
150 » » » logcatLog(logError, "%d consective timeouts when fetching uptime. Triggering reboot", maxTimeouts)
151 » » » break
152 » » } else if consecutiveTimeouts > 0 {
153 » » » logcatLog(logError, "Timeout when fetching uptime. Sleeping for 60s and trying again.")
154 » » » time.Sleep(60 * time.Second)
155 » » » continue
156 » » }
157 » » if uptimeRes.Err != nil {
158 » » » logcatLog(logError, "Failed to get uptime: %s", uptimeRes.Err.Error())
153 return 1 159 return 1
154 } 160 }
155 161
156 » » if uptime > maxUptime { 162 » » if uptimeRes.Uptime > maxUptime {
157 » » » logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptime, maxUptime) 163 » » » logcatLog(logInfo, "Max uptime exceeded: (%s > %s)\n", uptimeRes.Uptime, maxUptime)
158 break 164 break
159 } 165 }
160 » » logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptime, maxUptime) 166 » » logcatLog(logInfo, "No need to reboot, uptime < max_uptime: (%s < %s)\n", uptimeRes.Uptime, maxUptime)
161 » » time.Sleep(maxUptime - uptime + time.Second) 167 » » time.Sleep(maxUptime - uptimeRes.Uptime + time.Second)
dnj 2016/09/02 16:17:32 (Before the world forgets, add a comment about why
bpastene 2016/09/02 23:02:13 Done.
162 } 168 }
163 » if err = rebootDevice(); err != nil { 169 » if err := rebootDevice(); err != nil {
164 logcatLog(logError, "Failed to reboot device: %s", err.Error()) 170 logcatLog(logError, "Failed to reboot device: %s", err.Error())
165 return 1 171 return 1
166 } 172 }
167 return 0 173 return 0
168 } 174 }
169 175
170 func main() { 176 func main() {
171 paniccatcher.Do(func() { 177 paniccatcher.Do(func() {
172 os.Exit(realMain()) 178 os.Exit(realMain())
173 }, func(p *paniccatcher.Panic) { 179 }, func(p *paniccatcher.Panic) {
174 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack) 180 logcatLog(logError, "Panic: %s\n%s", p.Reason, p.Stack)
175 os.Exit(1) 181 os.Exit(1)
176 }) 182 })
177 } 183 }
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698