OLD | NEW |
---|---|
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package analyzer | 5 package analyzer |
6 | 6 |
7 import ( | 7 import ( |
8 "errors" | 8 "errors" |
9 "fmt" | 9 "fmt" |
10 "net/url" | 10 "net/url" |
11 "path/filepath" | 11 "path/filepath" |
12 "sort" | 12 "sort" |
13 "strings" | 13 "strings" |
14 "sync" | |
14 "time" | 15 "time" |
15 | 16 |
16 "github.com/Sirupsen/logrus" | 17 "github.com/Sirupsen/logrus" |
17 | 18 |
18 "infra/monitoring/client" | 19 "infra/monitoring/client" |
19 "infra/monitoring/messages" | 20 "infra/monitoring/messages" |
20 ) | 21 ) |
21 | 22 |
22 const ( | 23 const ( |
23 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. | 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. |
24 » StepCompletedRun = "completed run" | 25 » StepCompletedRun = "completed run" |
26 » treeCloserPri = 0 | |
27 » staleMasterSev = 0 | |
28 » staleBuilderSev = 0 | |
29 » hungBuilderSev = 1 | |
30 » idleBuilderSev = 1 | |
31 » offlineBuilderSev = 1 | |
25 ) | 32 ) |
26 | 33 |
27 var ( | 34 var ( |
28 log = logrus.New() | 35 log = logrus.New() |
29 ) | 36 ) |
30 | 37 |
31 var ( | 38 var ( |
32 errNoBuildSteps = errors.New("No build steps") | 39 errNoBuildSteps = errors.New("No build steps") |
33 ) | 40 ) |
34 | 41 |
(...skipping 14 matching lines...) Expand all Loading... | |
49 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer. | 56 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer. |
50 Reasons []string | 57 Reasons []string |
51 } | 58 } |
52 | 59 |
53 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on, | 60 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on, |
54 // in order to produce alerts. | 61 // in order to produce alerts. |
55 type MasterAnalyzer struct { | 62 type MasterAnalyzer struct { |
56 // MaxRecentBuilds is the maximum number of recent builds to check, per builder. | 63 // MaxRecentBuilds is the maximum number of recent builds to check, per builder. |
57 MaxRecentBuilds int | 64 MaxRecentBuilds int |
58 | 65 |
66 // MinRecentBuilds is the minimum number of recent builds to check, per builder. | |
67 MinRecentBuilds int | |
68 | |
59 // StepAnalzers are the set of build step failure analyzers to be checke d on | 69 // StepAnalzers are the set of build step failure analyzers to be checke d on |
60 // build step failures. | 70 // build step failures. |
61 StepAnalyzers []StepAnalyzer | 71 StepAnalyzers []StepAnalyzer |
62 | 72 |
63 // Client is the Client implementation for fetching json from CBE, build s, etc. | 73 // Client is the Client implementation for fetching json from CBE, build s, etc. |
64 Client client.Client | 74 Client client.Client |
65 | 75 |
66 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building" | 76 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building" |
67 // before triggering a "hung builder" alert. | 77 // before triggering a "hung builder" alert. |
68 HungBuilderThresh time.Duration | 78 HungBuilderThresh time.Duration |
69 | 79 |
70 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline" | 80 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline" |
71 // before triggering an "offline builder" alert. | 81 // before triggering an "offline builder" alert. |
72 OfflineBuilderThresh time.Duration | 82 OfflineBuilderThresh time.Duration |
73 | 83 |
74 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue | 84 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue |
75 // while in the "idle" state before triggering an "idle builder" alert. | 85 // while in the "idle" state before triggering an "idle builder" alert. |
76 IdleBuilderCountThresh int64 | 86 IdleBuilderCountThresh int64 |
77 | 87 |
78 // StaleMasterThreshold is the maximum age that master data from CBE can be before | 88 // StaleMasterThreshold is the maximum age that master data from CBE can be before |
79 // triggering a "stale master" alert. | 89 // triggering a "stale master" alert. |
80 StaleMasterThreshold time.Duration | 90 StaleMasterThreshold time.Duration |
81 | 91 |
82 // bCache is a map of build cache key to Build message. | 92 // bCache is a map of build cache key to Build message. |
83 bCache map[string]*messages.Builds | 93 bCache map[string]*messages.Builds |
94 // bLock protects bCache | |
95 bLock *sync.Mutex | |
96 | |
97 // These limit the scope analysis, useful for debugging. | |
98 TreeOnly string | |
99 MasterOnly string | |
100 BuilderOnly string | |
101 BuildOnly int64 | |
84 | 102 |
85 // now is useful for mocking the system clock in testing. | 103 // now is useful for mocking the system clock in testing. |
86 now func() time.Time | 104 now func() time.Time |
87 } | 105 } |
88 | 106 |
89 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion. | 107 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion. |
90 // maxBuilds is the maximum number of builds to check, per builder. | 108 // maxBuilds is the maximum number of builds to check, per builder. |
91 func New(c client.Client, maxBuilds int) *MasterAnalyzer { | 109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer { |
92 if c == nil { | 110 if c == nil { |
93 c = client.New() | 111 c = client.New() |
94 } | 112 } |
95 | 113 |
96 return &MasterAnalyzer{ | 114 return &MasterAnalyzer{ |
97 Client: c, | 115 Client: c, |
98 MaxRecentBuilds: maxBuilds, | 116 MaxRecentBuilds: maxBuilds, |
117 MinRecentBuilds: minBuilds, | |
99 HungBuilderThresh: 3 * time.Hour, | 118 HungBuilderThresh: 3 * time.Hour, |
100 OfflineBuilderThresh: 90 * time.Minute, | 119 OfflineBuilderThresh: 90 * time.Minute, |
101 IdleBuilderCountThresh: 50, | 120 IdleBuilderCountThresh: 50, |
102 StaleMasterThreshold: 10 * time.Minute, | 121 StaleMasterThreshold: 10 * time.Minute, |
103 StepAnalyzers: []StepAnalyzer{ | 122 StepAnalyzers: []StepAnalyzer{ |
104 &TestFailureAnalyzer{Client: c}, | 123 &TestFailureAnalyzer{Client: c}, |
124 &CompileFailureAnalyzer{Client: c}, | |
105 }, | 125 }, |
106 | 126 |
107 now: func() time.Time { | 127 now: func() time.Time { |
108 return time.Now() | 128 return time.Now() |
109 }, | 129 }, |
110 | 130 |
111 bCache: map[string]*messages.Builds{}, | 131 bCache: map[string]*messages.Builds{}, |
132 bLock: &sync.Mutex{}, | |
112 } | 133 } |
113 } | 134 } |
114 | 135 |
115 // MasterAlerts returns alerts generated from the master at URL. | 136 // MasterAlerts returns alerts generated from the master at URL. |
116 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert { | 137 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert { |
117 ret := []messages.Alert{} | 138 ret := []messages.Alert{} |
118 | 139 |
119 // Copied logic from builder_messages. | 140 // Copied logic from builder_messages. |
120 // No created_timestamp should be a warning sign, no? | 141 // No created_timestamp should be a warning sign, no? |
121 if be.CreatedTimestamp == messages.EpochTime(0) { | 142 if be.CreatedTimestamp == messages.EpochTime(0) { |
122 return ret | 143 return ret |
123 } | 144 } |
124 | 145 |
125 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) | 146 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) |
126 if elapsed > a.StaleMasterThreshold { | 147 if elapsed > a.StaleMasterThreshold { |
127 ret = append(ret, messages.Alert{ | 148 ret = append(ret, messages.Alert{ |
128 Key: fmt.Sprintf("stale master: %v", url), | 149 Key: fmt.Sprintf("stale master: %v", url), |
129 Title: "Stale Master Data", | 150 Title: "Stale Master Data", |
130 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()), | 151 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()), |
131 » » » Severity: 0, | 152 » » » Severity: staleMasterSev, |
132 Time: messages.TimeToEpochTime(a.now()), | 153 Time: messages.TimeToEpochTime(a.now()), |
133 Links: []messages.Link{{"Master", url}}, | 154 Links: []messages.Link{{"Master", url}}, |
134 // No type or extension for now. | 155 // No type or extension for now. |
135 }) | 156 }) |
136 } | 157 } |
137 if elapsed < 0 { | 158 if elapsed < 0 { |
138 // Add this to the alerts returned, rather than just log it? | 159 // Add this to the alerts returned, rather than just log it? |
139 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed) | 160 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed) |
140 } | 161 } |
141 | 162 |
(...skipping 11 matching lines...) Expand all Loading... | |
153 type r struct { | 174 type r struct { |
154 bn string | 175 bn string |
155 b messages.Builders | 176 b messages.Builders |
156 alerts []messages.Alert | 177 alerts []messages.Alert |
157 err []error | 178 err []error |
158 } | 179 } |
159 c := make(chan r, len(be.Builders)) | 180 c := make(chan r, len(be.Builders)) |
160 | 181 |
161 // TODO: get a list of all the running builds from be.Slaves? It | 182 // TODO: get a list of all the running builds from be.Slaves? It |
162 // appears to be used later on in the original py. | 183 // appears to be used later on in the original py. |
184 scannedBuilders := []string{} | |
163 for bn, b := range be.Builders { | 185 for bn, b := range be.Builders { |
186 if a.BuilderOnly != "" && bn != a.BuilderOnly { | |
187 continue | |
188 } | |
189 scannedBuilders = append(scannedBuilders, bn) | |
164 go func(bn string, b messages.Builders) { | 190 go func(bn string, b messages.Builders) { |
165 out := r{bn: bn, b: b} | 191 out := r{bn: bn, b: b} |
166 defer func() { | 192 defer func() { |
167 c <- out | 193 c <- out |
168 }() | 194 }() |
169 | 195 |
170 // This blocks on IO, hence the goroutine. | 196 // This blocks on IO, hence the goroutine. |
171 a.warmBuildCache(mn, bn, b.CachedBuilds) | 197 a.warmBuildCache(mn, bn, b.CachedBuilds) |
172 | |
173 // Each call to builderAlerts may trigger blocking json fetches, | 198 // Each call to builderAlerts may trigger blocking json fetches, |
174 // but it has a data dependency on the above cache-warmi ng call, so | 199 // but it has a data dependency on the above cache-warmi ng call, so |
175 // the logic remains serial. | 200 // the logic remains serial. |
176 out.alerts, out.err = a.builderAlerts(mn, bn, &b) | 201 out.alerts, out.err = a.builderAlerts(mn, bn, &b) |
177 }(bn, b) | 202 }(bn, b) |
178 } | 203 } |
179 | 204 |
180 ret := []messages.Alert{} | 205 ret := []messages.Alert{} |
181 » for bn := range be.Builders { | 206 » for _, bn := range scannedBuilders { |
182 r := <-c | 207 r := <-c |
183 if len(r.err) != 0 { | 208 if len(r.err) != 0 { |
184 // TODO: add a special alert for this too? | 209 // TODO: add a special alert for this too? |
185 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err) | 210 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err) |
186 } else { | 211 } else { |
187 ret = append(ret, r.alerts...) | 212 ret = append(ret, r.alerts...) |
188 } | 213 } |
189 } | 214 } |
190 | 215 |
191 return ret | 216 return ret |
192 } | 217 } |
193 | 218 |
194 // masterName extracts the name of the master from the master's URL. | 219 // masterName extracts the name of the master from the master's URL. |
195 func masterName(URL string) (string, error) { | 220 func masterName(URL string) (string, error) { |
196 mURL, err := url.Parse(URL) | 221 mURL, err := url.Parse(URL) |
197 if err != nil { | 222 if err != nil { |
198 return "", err | 223 return "", err |
199 } | 224 } |
200 pathParts := strings.Split(mURL.Path, "/") | 225 pathParts := strings.Split(mURL.Path, "/") |
201 » return pathParts[len(pathParts)-1], nil | 226 » return pathParts[len(pathParts)-2], nil |
Vadim Sh.
2015/05/06 21:40:18
check that len(pathParts) is >= 2. Also it would b
seanmccullough
2015/05/06 22:16:20
To lop the /json off the http://.../{master.name}/
| |
202 } | 227 } |
203 | 228 |
204 func cacheKeyForBuild(master, builder string, number int64) string { | 229 func cacheKeyForBuild(master, builder string, number int64) string { |
205 return filepath.FromSlash( | 230 return filepath.FromSlash( |
206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number)) | 231 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number)) |
207 } | 232 } |
208 | 233 |
209 // TODO: actually write the on-disk cache. | 234 // TODO: actually write the on-disk cache. |
210 func filenameForCacheKey(cc string) string { | 235 func filenameForCacheKey(cc string) string { |
211 cc = strings.Replace(cc, "/", "_", -1) | 236 cc = strings.Replace(cc, "/", "_", -1) |
(...skipping 10 matching lines...) Expand all Loading... | |
222 v.Add("builder", builder) | 247 v.Add("builder", builder) |
223 | 248 |
224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode()) | 249 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode()) |
225 res := struct { | 250 res := struct { |
226 Builds []messages.Builds `json:"builds"` | 251 Builds []messages.Builds `json:"builds"` |
227 }{} | 252 }{} |
228 | 253 |
229 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and | 254 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and |
230 // return (map[{master, builder}][]Builds, map [{master, builder}]error) | 255 // return (map[{master, builder}][]Builds, map [{master, builder}]error) |
231 // That way we can do all of these in parallel. | 256 // That way we can do all of these in parallel. |
257 | |
232 status, err := a.Client.JSON(URL, &res) | 258 status, err := a.Client.JSON(URL, &res) |
233 if err != nil { | 259 if err != nil { |
234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) | 260 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) |
235 } | 261 } |
236 | 262 |
263 a.bLock.Lock() | |
237 for _, b := range res.Builds { | 264 for _, b := range res.Builds { |
238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b | 265 » » // TODO: consider making res.Builds be []*messages.Builds instea d of []messages.Builds |
266 » » ba := b | |
Vadim Sh.
2015/05/06 21:40:18
IIUC, b is already a copy of whatever is in messag
seanmccullough
2015/05/06 22:16:20
Taking the address of a range var always gives you
Vadim Sh.
2015/05/06 22:30:23
:) Good to know. Go pointers + garbage collection
seanmccullough
2015/05/06 22:43:13
Thankfully there aren't quite as many wtf moments
| |
267 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba | |
239 } | 268 } |
269 a.bLock.Unlock() | |
240 } | 270 } |
241 | 271 |
242 // This type is used for sorting build IDs. | 272 // This type is used for sorting build IDs. |
243 type buildIDs []int64 | 273 type buildIDs []int64 |
244 | 274 |
245 func (a buildIDs) Len() int { return len(a) } | 275 func (a buildIDs) Len() int { return len(a) } |
246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | 276 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } | 277 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } |
248 | 278 |
249 // latestBuildStep returns the latest build step name and update time, and an er ror | 279 // latestBuildStep returns the latest build step name and update time, and an er ror |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] | 313 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] |
284 } | 314 } |
285 if len(recentBuildIDs) == 0 { | 315 if len(recentBuildIDs) == 0 { |
286 // TODO: Make an alert for this? | 316 // TODO: Make an alert for this? |
287 log.Errorf("No recent builds for %s.%s", mn, bn) | 317 log.Errorf("No recent builds for %s.%s", mn, bn) |
288 return alerts, errs | 318 return alerts, errs |
289 } | 319 } |
290 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn) | 320 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn) |
291 | 321 |
292 // Check for alertable step failures. | 322 // Check for alertable step failures. |
293 » for _, buildID := range recentBuildIDs { | 323 » stepFailureAlerts := map[string]messages.Alert{} |
324 » firstFailureInstance := map[string]int64{} | |
325 | |
326 » for i, buildID := range recentBuildIDs { | |
294 failures, err := a.stepFailures(mn, bn, buildID) | 327 failures, err := a.stepFailures(mn, bn, buildID) |
295 if err != nil { | 328 if err != nil { |
296 errs = append(errs, err) | 329 errs = append(errs, err) |
297 } | 330 } |
331 if len(failures) == 0 && i > a.MinRecentBuilds { | |
332 // Bail as soon as we find a successful build prior to t he most recent couple of builds. | |
333 break | |
334 } | |
335 | |
298 as, err := a.stepFailureAlerts(failures) | 336 as, err := a.stepFailureAlerts(failures) |
299 if err != nil { | 337 if err != nil { |
300 errs = append(errs, err) | 338 errs = append(errs, err) |
301 } | 339 } |
302 » » alerts = append(alerts, as...) | 340 |
341 » » lastKey := "" | |
342 » » // This probably isn't exactly what we want since multiple build ers may have the same alert firing. | |
343 » » // This just merges build ranges on a per-builder basis. | |
344 » » for _, alr := range as { | |
345 » » » // Only keep the most recent alert. Since recentBuildID s is | |
346 » » » // sorted newest build first, ignore older instances of the alert. | |
347 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok { | |
348 » » » » stepFailureAlerts[alr.Key] = alr | |
349 » » » } | |
350 » » » if alr.Key == lastKey || lastKey == "" { | |
351 » » » » firstFailureInstance[alr.Key] = alr.Extension.(m essages.BuildFailure).Builders[0].FirstFailure | |
Vadim Sh.
2015/05/06 21:40:18
This type assertion looks rather magical... Perhap
seanmccullough
2015/05/06 22:16:20
Added a check and log an error if it fails. Not su
| |
352 » » » } | |
353 » » » lastKey = alr.Key | |
354 » » } | |
303 } | 355 } |
304 | 356 |
305 » // Check for stale builders. Latest build is the first in the list. | 357 » for _, alr := range stepFailureAlerts { |
358 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key] | |
359 » » alerts = append(alerts, alr) | |
360 » } | |
361 | |
362 » // Check for stale/idle/offline builders. Latest build is the first in the list. | |
306 lastBuildID := recentBuildIDs[0] | 363 lastBuildID := recentBuildIDs[0] |
307 log.Infof("Checking last build ID: %d", lastBuildID) | 364 log.Infof("Checking last build ID: %d", lastBuildID) |
365 | |
308 // TODO: get this from cache. | 366 // TODO: get this from cache. |
309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID) | 367 » var lastBuild *messages.Builds |
310 » if err != nil { | 368 » a.bLock.Lock() |
311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err)) | 369 » lastBuild = a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)] |
Vadim Sh.
2015/05/06 21:40:18
:= and remove var above
seanmccullough
2015/05/06 22:16:20
Done.
| |
312 » » return alerts, errs | 370 » a.bLock.Unlock() |
371 » if lastBuild == nil { | |
372 » » var err error | |
373 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID) | |
374 » » if err != nil { | |
375 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil d %d for %s.%s: %s", lastBuildID, mn, bn, err)) | |
376 » » » return alerts, errs | |
377 » » } | |
313 } | 378 } |
314 | 379 |
315 // Examining only the latest build is probably suboptimal since if it's still in progress it might | 380 // Examining only the latest build is probably suboptimal since if it's still in progress it might |
316 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds. | 381 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds. |
317 // AKA "Reliable failures". TODO: Identify "Reliable failures" | 382 // AKA "Reliable failures". TODO: Identify "Reliable failures" |
318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) | 383 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) |
319 if err != nil { | 384 if err != nil { |
320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err)) | 385 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err)) |
321 return alerts, errs | 386 return alerts, errs |
322 | 387 |
323 } | 388 } |
324 elapsed := a.now().Sub(lastUpdated.Time()) | 389 elapsed := a.now().Sub(lastUpdated.Time()) |
325 links := []messages.Link{ | 390 links := []messages.Link{ |
326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)}, | 391 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)}, |
327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)}, | 392 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)}, |
328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, | 393 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, |
329 } | 394 } |
330 | 395 |
331 switch b.State { | 396 switch b.State { |
332 case messages.StateBuilding: | 397 case messages.StateBuilding: |
333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { | 398 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { |
334 alerts = append(alerts, messages.Alert{ | 399 alerts = append(alerts, messages.Alert{ |
335 Key: fmt.Sprintf("%s.%s.hung", mn, bn), | 400 Key: fmt.Sprintf("%s.%s.hung", mn, bn), |
336 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep), | 401 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep), |
337 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh), | 402 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh), |
338 » » » » Severity: 0, | 403 » » » » Severity: hungBuilderSev, |
339 Time: messages.TimeToEpochTime(a.now()), | 404 Time: messages.TimeToEpochTime(a.now()), |
340 Links: links, | 405 Links: links, |
341 }) | 406 }) |
342 // Note, just because it's building doesn't mean it's in a good state. If the last N builds | 407 // Note, just because it's building doesn't mean it's in a good state. If the last N builds |
343 // all failed (for some large N) then this might still b e alertable. | 408 // all failed (for some large N) then this might still b e alertable. |
344 } | 409 } |
345 case messages.StateOffline: | 410 case messages.StateOffline: |
346 if elapsed > a.OfflineBuilderThresh { | 411 if elapsed > a.OfflineBuilderThresh { |
347 alerts = append(alerts, messages.Alert{ | 412 alerts = append(alerts, messages.Alert{ |
348 Key: fmt.Sprintf("%s.%s.offline", mn, bn), | 413 Key: fmt.Sprintf("%s.%s.offline", mn, bn), |
349 Title: fmt.Sprintf("%s.%s is offline.", mn, b n), | 414 Title: fmt.Sprintf("%s.%s is offline.", mn, b n), |
350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh), | 415 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh), |
351 » » » » Severity: 0, | 416 » » » » Severity: offlineBuilderSev, |
352 Time: messages.TimeToEpochTime(a.now()), | 417 Time: messages.TimeToEpochTime(a.now()), |
353 Links: links, | 418 Links: links, |
354 }) | 419 }) |
355 } | 420 } |
356 case messages.StateIdle: | 421 case messages.StateIdle: |
357 if b.PendingBuilds > a.IdleBuilderCountThresh { | 422 if b.PendingBuilds > a.IdleBuilderCountThresh { |
358 alerts = append(alerts, messages.Alert{ | 423 alerts = append(alerts, messages.Alert{ |
359 Key: fmt.Sprintf("%s.%s.idle", mn, bn), | 424 Key: fmt.Sprintf("%s.%s.idle", mn, bn), |
360 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn), | 425 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn), |
361 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh), | 426 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh), |
362 » » » » Severity: 0, | 427 » » » » Severity: idleBuilderSev, |
363 Time: messages.TimeToEpochTime(a.now()), | 428 Time: messages.TimeToEpochTime(a.now()), |
364 Links: links, | 429 Links: links, |
365 }) | 430 }) |
366 } | 431 } |
367 default: | 432 default: |
368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) | 433 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) |
369 } | 434 } |
370 | 435 |
371 return alerts, errs | 436 return alerts, errs |
372 } | 437 } |
373 | 438 |
374 // stepFailures returns the steps that have failed recently on builder bn. | 439 // stepFailures returns the steps that have failed recently on builder bn. |
375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) { | 440 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) { |
376 cc := cacheKeyForBuild(mn, bn, bID) | 441 cc := cacheKeyForBuild(mn, bn, bID) |
377 | 442 |
378 var err error // To avoid re-scoping b in the nested conditional below w ith a :=. | 443 var err error // To avoid re-scoping b in the nested conditional below w ith a :=. |
444 a.bLock.Lock() | |
379 b, ok := a.bCache[cc] | 445 b, ok := a.bCache[cc] |
446 a.bLock.Unlock() | |
380 if !ok { | 447 if !ok { |
381 log.Infof("Cache miss for %s", cc) | 448 log.Infof("Cache miss for %s", cc) |
382 b, err = a.Client.Build(mn, bn, bID) | 449 b, err = a.Client.Build(mn, bn, bID) |
383 if err != nil || b == nil { | 450 if err != nil || b == nil { |
384 log.Errorf("Error fetching build: %v", err) | 451 log.Errorf("Error fetching build: %v", err) |
385 return nil, err | 452 return nil, err |
386 } | 453 } |
387 } | 454 } |
388 | 455 |
389 ret := []stepFailure{} | 456 ret := []stepFailure{} |
457 | |
390 for _, s := range b.Steps { | 458 for _, s := range b.Steps { |
391 if !s.IsFinished || len(s.Results) == 0 { | 459 if !s.IsFinished || len(s.Results) == 0 { |
392 continue | 460 continue |
393 } | 461 } |
394 // Because Results in the json data is a homogeneous array, the unmarshaler | 462 // Because Results in the json data is a homogeneous array, the unmarshaler |
395 // doesn't have any type information to assert about it. We have to do | 463 // doesn't have any type information to assert about it. We have to do |
396 // some ugly runtime type assertion ourselves. | 464 // some ugly runtime type assertion ourselves. |
397 if r, ok := s.Results[0].(float64); ok { | 465 if r, ok := s.Results[0].(float64); ok { |
398 if r == 0 || r == 1 { | 466 if r == 0 || r == 1 { |
399 // This 0/1 check seems to be a convention or he uristic. A 0 or 1 | 467 // This 0/1 check seems to be a convention or he uristic. A 0 or 1 |
(...skipping 22 matching lines...) Expand all Loading... | |
422 ret := []messages.Alert{} | 490 ret := []messages.Alert{} |
423 type res struct { | 491 type res struct { |
424 f stepFailure | 492 f stepFailure |
425 a *messages.Alert | 493 a *messages.Alert |
426 err error | 494 err error |
427 } | 495 } |
428 | 496 |
429 // Might not need full capacity buffer, since some failures are ignored below. | 497 // Might not need full capacity buffer, since some failures are ignored below. |
430 rs := make(chan res, len(failures)) | 498 rs := make(chan res, len(failures)) |
431 | 499 |
500 scannedFailures := []stepFailure{} | |
432 for _, f := range failures { | 501 for _, f := range failures { |
433 // goroutine/channel because the reasonsForFailure call potentia lly | 502 // goroutine/channel because the reasonsForFailure call potentia lly |
434 // blocks on IO. | 503 // blocks on IO. |
504 if f.step.Name == "steps" { | |
505 continue | |
506 // The actual breaking step will appear later. | |
507 } | |
508 scannedFailures = append(scannedFailures, f) | |
435 go func(f stepFailure) { | 509 go func(f stepFailure) { |
436 alr := messages.Alert{ | 510 alr := messages.Alert{ |
437 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), | 511 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), |
438 Time: messages.EpochTime(a.now().Unix()), | 512 Time: messages.EpochTime(a.now().Unix()), |
439 Type: "buildfailure", | 513 Type: "buildfailure", |
440 } | 514 } |
441 | 515 |
516 // If the builder has been failing on the same step for multiple builds in a row, | |
517 // we should have only one alert but indicate the range of builds affected. | |
518 // These are set in FirstFailure and LastFailure. | |
442 bf := messages.BuildFailure{ | 519 bf := messages.BuildFailure{ |
443 // FIXME: group builders? | 520 // FIXME: group builders? |
444 Builders: []messages.AlertedBuilder{ | 521 Builders: []messages.AlertedBuilder{ |
445 { | 522 { |
446 Name: f.builderName, | 523 Name: f.builderName, |
447 » » » » » » URL: f.URL(), | 524 » » » » » » URL: fmt.Sprintf("http s://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName), |
448 » » » » » » FirstFailure: 0, | 525 » » » » » » FirstFailure: f.build.Number, |
449 » » » » » » LatestFailure: 1, | 526 » » » » » » LatestFailure: f.build.Number, |
450 }, | 527 }, |
451 }, | 528 }, |
452 // TODO: RegressionRanges: | 529 // TODO: RegressionRanges: |
453 // look into Builds.SourceStamp.Changes. | 530 // look into Builds.SourceStamp.Changes. |
454 } | 531 } |
455 | 532 |
456 reasons := a.reasonsForFailure(f) | 533 reasons := a.reasonsForFailure(f) |
457 for _, r := range reasons { | 534 for _, r := range reasons { |
458 bf.Reasons = append(bf.Reasons, messages.Reason{ | 535 bf.Reasons = append(bf.Reasons, messages.Reason{ |
459 TestName: r, | 536 TestName: r, |
460 Step: f.step.Name, | 537 Step: f.step.Name, |
538 URL: f.URL(), | |
461 }) | 539 }) |
462 } | 540 } |
463 | 541 |
464 alr.Extension = bf | 542 alr.Extension = bf |
465 if len(bf.Reasons) == 0 { | 543 if len(bf.Reasons) == 0 { |
466 » » » » log.Warnf("No reasons for step failure: %s", ale rtKey(f.masterName, f.builderName, f.step.Name, "")) | 544 » » » » alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "") |
467 » » » » rs <- res{ | 545 » » » » log.Warnf("No reasons for step failure: %s", alr .Key) |
468 » » » » » f: f, | |
469 » » » » } | |
470 } else { | 546 } else { |
471 // Should the key include all of the reasons? | 547 // Should the key include all of the reasons? |
472 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0]) | 548 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0]) |
473 | 549 » » » } |
474 » » » » rs <- res{ | 550 » » » rs <- res{ |
475 » » » » » f: f, | 551 » » » » f: f, |
476 » » » » » a: &alr, | 552 » » » » a: &alr, |
477 » » » » » err: nil, | 553 » » » » err: nil, |
478 » » » » } | |
479 } | 554 } |
480 }(f) | 555 }(f) |
481 } | 556 } |
482 | 557 |
483 » for _ = range failures { | 558 » for _ = range scannedFailures { |
484 r := <-rs | 559 r := <-rs |
485 if r.a != nil { | 560 if r.a != nil { |
486 ret = append(ret, *r.a) | 561 ret = append(ret, *r.a) |
487 } | 562 } |
488 } | 563 } |
489 | 564 |
490 return ret, nil | 565 return ret, nil |
491 } | 566 } |
492 | 567 |
493 // reasonsForFailure examines the step failure and applies some heuristics to | 568 // reasonsForFailure examines the step failure and applies some heuristics to |
494 // to find the cause. It may make blocking IO calls in the process. | 569 // to find the cause. It may make blocking IO calls in the process. |
495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { | 570 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { |
496 ret := []string{} | 571 ret := []string{} |
497 recognized := false | 572 recognized := false |
498 log.Infof("Checking for reasons for failure step: %v", f.step.Name) | 573 log.Infof("Checking for reasons for failure step: %v", f.step.Name) |
499 for _, sfa := range a.StepAnalyzers { | 574 for _, sfa := range a.StepAnalyzers { |
500 res, err := sfa.Analyze(f) | 575 res, err := sfa.Analyze(f) |
501 if err != nil { | 576 if err != nil { |
577 // TODO: return something that contains errors *and* rea sons. | |
502 log.Errorf("Error get reasons from StepAnalyzer: %v", er r) | 578 log.Errorf("Error get reasons from StepAnalyzer: %v", er r) |
579 continue | |
503 } | 580 } |
504 if res.Recognized { | 581 if res.Recognized { |
505 recognized = true | 582 recognized = true |
506 ret = append(ret, res.Reasons...) | 583 ret = append(ret, res.Reasons...) |
507 } | 584 } |
508 } | 585 } |
509 | 586 |
510 if !recognized { | 587 if !recognized { |
511 // TODO: log and report frequently encountered unrecognized buil der step | 588 // TODO: log and report frequently encountered unrecognized buil der step |
512 // failure names. | 589 // failure names. |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. | 629 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. |
553 func oldEscape(s string) string { | 630 func oldEscape(s string) string { |
554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) | 631 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) |
555 } | 632 } |
556 | 633 |
557 // URL returns a url to builder step failure page. | 634 // URL returns a url to builder step failure page. |
558 func (f stepFailure) URL() string { | 635 func (f stepFailure) URL() string { |
559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s", | 636 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s", |
560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name)) | 637 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name)) |
561 } | 638 } |
OLD | NEW |