OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package analyzer | 5 package analyzer |
6 | 6 |
7 import ( | 7 import ( |
8 "errors" | 8 "errors" |
9 "fmt" | 9 "fmt" |
10 "net/url" | 10 "net/url" |
11 "path/filepath" | 11 "path/filepath" |
12 "sort" | 12 "sort" |
13 "strings" | 13 "strings" |
| 14 "sync" |
14 "time" | 15 "time" |
15 | 16 |
16 "github.com/Sirupsen/logrus" | 17 "github.com/Sirupsen/logrus" |
17 | 18 |
18 "infra/monitoring/client" | 19 "infra/monitoring/client" |
19 "infra/monitoring/messages" | 20 "infra/monitoring/messages" |
20 ) | 21 ) |
21 | 22 |
22 const ( | 23 const ( |
23 // StepCompletedRun is a synthetic step name used to indicate the build
run is complete. | 24 // StepCompletedRun is a synthetic step name used to indicate the build
run is complete. |
24 » StepCompletedRun = "completed run" | 25 » StepCompletedRun = "completed run" |
| 26 » treeCloserPri = 0 |
| 27 » staleMasterSev = 0 |
| 28 » staleBuilderSev = 0 |
| 29 » hungBuilderSev = 1 |
| 30 » idleBuilderSev = 1 |
| 31 » offlineBuilderSev = 1 |
25 ) | 32 ) |
26 | 33 |
27 var ( | 34 var ( |
28 log = logrus.New() | 35 log = logrus.New() |
29 ) | 36 ) |
30 | 37 |
31 var ( | 38 var ( |
32 errNoBuildSteps = errors.New("No build steps") | 39 errNoBuildSteps = errors.New("No build steps") |
33 ) | 40 ) |
34 | 41 |
(...skipping 14 matching lines...) Expand all Loading... |
49 // Reasons lists the reasons for the stepFailure determined by the StepF
ailureAnalyzer. | 56 // Reasons lists the reasons for the stepFailure determined by the StepF
ailureAnalyzer. |
50 Reasons []string | 57 Reasons []string |
51 } | 58 } |
52 | 59 |
53 // MasterAnalyzer runs the process of checking masters, builders, test results a
nd so on, | 60 // MasterAnalyzer runs the process of checking masters, builders, test results a
nd so on, |
54 // in order to produce alerts. | 61 // in order to produce alerts. |
55 type MasterAnalyzer struct { | 62 type MasterAnalyzer struct { |
56 // MaxRecentBuilds is the maximum number of recent builds to check, per
builder. | 63 // MaxRecentBuilds is the maximum number of recent builds to check, per
builder. |
57 MaxRecentBuilds int | 64 MaxRecentBuilds int |
58 | 65 |
| 66 // MinRecentBuilds is the minimum number of recent builds to check, per
builder. |
| 67 MinRecentBuilds int |
| 68 |
59 // StepAnalzers are the set of build step failure analyzers to be checke
d on | 69 // StepAnalzers are the set of build step failure analyzers to be checke
d on |
60 // build step failures. | 70 // build step failures. |
61 StepAnalyzers []StepAnalyzer | 71 StepAnalyzers []StepAnalyzer |
62 | 72 |
63 // Client is the Client implementation for fetching json from CBE, build
s, etc. | 73 // Client is the Client implementation for fetching json from CBE, build
s, etc. |
64 Client client.Client | 74 Client client.Client |
65 | 75 |
66 // HungBuilerThresh is the maxumum length of time a builder may be in st
ate "building" | 76 // HungBuilerThresh is the maxumum length of time a builder may be in st
ate "building" |
67 // before triggering a "hung builder" alert. | 77 // before triggering a "hung builder" alert. |
68 HungBuilderThresh time.Duration | 78 HungBuilderThresh time.Duration |
69 | 79 |
70 // OfflineBuilderThresh is the maximum length of time a builder may be i
n state "offline" | 80 // OfflineBuilderThresh is the maximum length of time a builder may be i
n state "offline" |
71 // before triggering an "offline builder" alert. | 81 // before triggering an "offline builder" alert. |
72 OfflineBuilderThresh time.Duration | 82 OfflineBuilderThresh time.Duration |
73 | 83 |
74 // IdleBuilderCountThresh is the maximum number of builds a builder may
have in queue | 84 // IdleBuilderCountThresh is the maximum number of builds a builder may
have in queue |
75 // while in the "idle" state before triggering an "idle builder" alert. | 85 // while in the "idle" state before triggering an "idle builder" alert. |
76 IdleBuilderCountThresh int64 | 86 IdleBuilderCountThresh int64 |
77 | 87 |
78 // StaleMasterThreshold is the maximum age that master data from CBE can
be before | 88 // StaleMasterThreshold is the maximum age that master data from CBE can
be before |
79 // triggering a "stale master" alert. | 89 // triggering a "stale master" alert. |
80 StaleMasterThreshold time.Duration | 90 StaleMasterThreshold time.Duration |
81 | 91 |
82 // bCache is a map of build cache key to Build message. | 92 // bCache is a map of build cache key to Build message. |
83 bCache map[string]*messages.Builds | 93 bCache map[string]*messages.Builds |
| 94 // bLock protects bCache |
| 95 bLock *sync.Mutex |
| 96 |
| 97 // These limit the scope analysis, useful for debugging. |
| 98 TreeOnly string |
| 99 MasterOnly string |
| 100 BuilderOnly string |
| 101 BuildOnly int64 |
84 | 102 |
85 // now is useful for mocking the system clock in testing. | 103 // now is useful for mocking the system clock in testing. |
86 now func() time.Time | 104 now func() time.Time |
87 } | 105 } |
88 | 106 |
89 // New returns a new Analyzer. If client is nil, it assigns a default implementa
tion. | 107 // New returns a new Analyzer. If client is nil, it assigns a default implementa
tion. |
90 // maxBuilds is the maximum number of builds to check, per builder. | 108 // maxBuilds is the maximum number of builds to check, per builder. |
91 func New(c client.Client, maxBuilds int) *MasterAnalyzer { | 109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer { |
92 if c == nil { | 110 if c == nil { |
93 c = client.New() | 111 c = client.New() |
94 } | 112 } |
95 | 113 |
96 return &MasterAnalyzer{ | 114 return &MasterAnalyzer{ |
97 Client: c, | 115 Client: c, |
98 MaxRecentBuilds: maxBuilds, | 116 MaxRecentBuilds: maxBuilds, |
| 117 MinRecentBuilds: minBuilds, |
99 HungBuilderThresh: 3 * time.Hour, | 118 HungBuilderThresh: 3 * time.Hour, |
100 OfflineBuilderThresh: 90 * time.Minute, | 119 OfflineBuilderThresh: 90 * time.Minute, |
101 IdleBuilderCountThresh: 50, | 120 IdleBuilderCountThresh: 50, |
102 StaleMasterThreshold: 10 * time.Minute, | 121 StaleMasterThreshold: 10 * time.Minute, |
103 StepAnalyzers: []StepAnalyzer{ | 122 StepAnalyzers: []StepAnalyzer{ |
104 &TestFailureAnalyzer{Client: c}, | 123 &TestFailureAnalyzer{Client: c}, |
| 124 &CompileFailureAnalyzer{Client: c}, |
105 }, | 125 }, |
106 | 126 |
107 now: func() time.Time { | 127 now: func() time.Time { |
108 return time.Now() | 128 return time.Now() |
109 }, | 129 }, |
110 | 130 |
111 bCache: map[string]*messages.Builds{}, | 131 bCache: map[string]*messages.Builds{}, |
| 132 bLock: &sync.Mutex{}, |
112 } | 133 } |
113 } | 134 } |
114 | 135 |
115 // MasterAlerts returns alerts generated from the master at URL. | 136 // MasterAlerts returns alerts generated from the master at URL. |
116 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m
essages.Alert { | 137 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m
essages.Alert { |
117 ret := []messages.Alert{} | 138 ret := []messages.Alert{} |
118 | 139 |
119 // Copied logic from builder_messages. | 140 // Copied logic from builder_messages. |
120 // No created_timestamp should be a warning sign, no? | 141 // No created_timestamp should be a warning sign, no? |
121 if be.CreatedTimestamp == messages.EpochTime(0) { | 142 if be.CreatedTimestamp == messages.EpochTime(0) { |
122 return ret | 143 return ret |
123 } | 144 } |
124 | 145 |
125 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) | 146 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) |
126 if elapsed > a.StaleMasterThreshold { | 147 if elapsed > a.StaleMasterThreshold { |
127 ret = append(ret, messages.Alert{ | 148 ret = append(ret, messages.Alert{ |
128 Key: fmt.Sprintf("stale master: %v", url), | 149 Key: fmt.Sprintf("stale master: %v", url), |
129 Title: "Stale Master Data", | 150 Title: "Stale Master Data", |
130 Body: fmt.Sprintf("%s elapsed since last update (%s)
.", elapsed, be.CreatedTimestamp.Time()), | 151 Body: fmt.Sprintf("%s elapsed since last update (%s)
.", elapsed, be.CreatedTimestamp.Time()), |
131 » » » Severity: 0, | 152 » » » Severity: staleMasterSev, |
132 Time: messages.TimeToEpochTime(a.now()), | 153 Time: messages.TimeToEpochTime(a.now()), |
133 Links: []messages.Link{{"Master", url}}, | 154 Links: []messages.Link{{"Master", url}}, |
134 // No type or extension for now. | 155 // No type or extension for now. |
135 }) | 156 }) |
136 } | 157 } |
137 if elapsed < 0 { | 158 if elapsed < 0 { |
138 // Add this to the alerts returned, rather than just log it? | 159 // Add this to the alerts returned, rather than just log it? |
139 log.Errorf("Master %s timestamp is newer than current time (%s):
%s old.", url, a.now(), elapsed) | 160 log.Errorf("Master %s timestamp is newer than current time (%s):
%s old.", url, a.now(), elapsed) |
140 } | 161 } |
141 | 162 |
(...skipping 11 matching lines...) Expand all Loading... |
153 type r struct { | 174 type r struct { |
154 bn string | 175 bn string |
155 b messages.Builders | 176 b messages.Builders |
156 alerts []messages.Alert | 177 alerts []messages.Alert |
157 err []error | 178 err []error |
158 } | 179 } |
159 c := make(chan r, len(be.Builders)) | 180 c := make(chan r, len(be.Builders)) |
160 | 181 |
161 // TODO: get a list of all the running builds from be.Slaves? It | 182 // TODO: get a list of all the running builds from be.Slaves? It |
162 // appears to be used later on in the original py. | 183 // appears to be used later on in the original py. |
| 184 scannedBuilders := []string{} |
163 for bn, b := range be.Builders { | 185 for bn, b := range be.Builders { |
| 186 if a.BuilderOnly != "" && bn != a.BuilderOnly { |
| 187 continue |
| 188 } |
| 189 scannedBuilders = append(scannedBuilders, bn) |
164 go func(bn string, b messages.Builders) { | 190 go func(bn string, b messages.Builders) { |
165 out := r{bn: bn, b: b} | 191 out := r{bn: bn, b: b} |
166 defer func() { | 192 defer func() { |
167 c <- out | 193 c <- out |
168 }() | 194 }() |
169 | 195 |
170 // This blocks on IO, hence the goroutine. | 196 // This blocks on IO, hence the goroutine. |
171 a.warmBuildCache(mn, bn, b.CachedBuilds) | 197 a.warmBuildCache(mn, bn, b.CachedBuilds) |
172 | |
173 // Each call to builderAlerts may trigger blocking json
fetches, | 198 // Each call to builderAlerts may trigger blocking json
fetches, |
174 // but it has a data dependency on the above cache-warmi
ng call, so | 199 // but it has a data dependency on the above cache-warmi
ng call, so |
175 // the logic remains serial. | 200 // the logic remains serial. |
176 out.alerts, out.err = a.builderAlerts(mn, bn, &b) | 201 out.alerts, out.err = a.builderAlerts(mn, bn, &b) |
177 }(bn, b) | 202 }(bn, b) |
178 } | 203 } |
179 | 204 |
180 ret := []messages.Alert{} | 205 ret := []messages.Alert{} |
181 » for bn := range be.Builders { | 206 » for _, bn := range scannedBuilders { |
182 r := <-c | 207 r := <-c |
183 if len(r.err) != 0 { | 208 if len(r.err) != 0 { |
184 // TODO: add a special alert for this too? | 209 // TODO: add a special alert for this too? |
185 log.Errorf("Error getting alerts for builder %s: %v", bn
, r.err) | 210 log.Errorf("Error getting alerts for builder %s: %v", bn
, r.err) |
186 } else { | 211 } else { |
187 ret = append(ret, r.alerts...) | 212 ret = append(ret, r.alerts...) |
188 } | 213 } |
189 } | 214 } |
190 | 215 |
191 return ret | 216 return ret |
192 } | 217 } |
193 | 218 |
194 // masterName extracts the name of the master from the master's URL. | 219 // masterName extracts the name of the master from the master's URL. |
195 func masterName(URL string) (string, error) { | 220 func masterName(URL string) (string, error) { |
196 mURL, err := url.Parse(URL) | 221 mURL, err := url.Parse(URL) |
197 if err != nil { | 222 if err != nil { |
198 return "", err | 223 return "", err |
199 } | 224 } |
200 pathParts := strings.Split(mURL.Path, "/") | 225 pathParts := strings.Split(mURL.Path, "/") |
201 » return pathParts[len(pathParts)-1], nil | 226 » if len(pathParts) < 2 { |
| 227 » » return "", fmt.Errorf("Couldn't parse master name from %s", URL) |
| 228 » } |
| 229 » // -2 to lop the /json off the http://.../{master.name}/json suffix |
| 230 » return pathParts[len(pathParts)-2], nil |
202 } | 231 } |
203 | 232 |
204 func cacheKeyForBuild(master, builder string, number int64) string { | 233 func cacheKeyForBuild(master, builder string, number int64) string { |
205 return filepath.FromSlash( | 234 return filepath.FromSlash( |
206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE
scape(builder), number)) | 235 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE
scape(builder), number)) |
207 } | 236 } |
208 | 237 |
209 // TODO: actually write the on-disk cache. | 238 // TODO: actually write the on-disk cache. |
210 func filenameForCacheKey(cc string) string { | 239 func filenameForCacheKey(cc string) string { |
211 cc = strings.Replace(cc, "/", "_", -1) | 240 cc = strings.Replace(cc, "/", "_", -1) |
(...skipping 10 matching lines...) Expand all Loading... |
222 v.Add("builder", builder) | 251 v.Add("builder", builder) |
223 | 252 |
224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds?
%s", v.Encode()) | 253 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds?
%s", v.Encode()) |
225 res := struct { | 254 res := struct { |
226 Builds []messages.Builds `json:"builds"` | 255 Builds []messages.Builds `json:"builds"` |
227 }{} | 256 }{} |
228 | 257 |
229 // TODO: add FetchBuilds to the client interface. Take a list of {master
, builder} and | 258 // TODO: add FetchBuilds to the client interface. Take a list of {master
, builder} and |
230 // return (map[{master, builder}][]Builds, map [{master, builder}]error) | 259 // return (map[{master, builder}][]Builds, map [{master, builder}]error) |
231 // That way we can do all of these in parallel. | 260 // That way we can do all of these in parallel. |
| 261 |
232 status, err := a.Client.JSON(URL, &res) | 262 status, err := a.Client.JSON(URL, &res) |
233 if err != nil { | 263 if err != nil { |
234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) | 264 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) |
235 } | 265 } |
236 | 266 |
| 267 a.bLock.Lock() |
237 for _, b := range res.Builds { | 268 for _, b := range res.Builds { |
238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b | 269 » » // TODO: consider making res.Builds be []*messages.Builds instea
d of []messages.Builds |
| 270 » » ba := b |
| 271 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba |
239 } | 272 } |
| 273 a.bLock.Unlock() |
240 } | 274 } |
241 | 275 |
242 // This type is used for sorting build IDs. | 276 // This type is used for sorting build IDs. |
243 type buildIDs []int64 | 277 type buildIDs []int64 |
244 | 278 |
245 func (a buildIDs) Len() int { return len(a) } | 279 func (a buildIDs) Len() int { return len(a) } |
246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | 280 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } | 281 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } |
248 | 282 |
249 // latestBuildStep returns the latest build step name and update time, and an er
ror | 283 // latestBuildStep returns the latest build step name and update time, and an er
ror |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] | 317 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] |
284 } | 318 } |
285 if len(recentBuildIDs) == 0 { | 319 if len(recentBuildIDs) == 0 { |
286 // TODO: Make an alert for this? | 320 // TODO: Make an alert for this? |
287 log.Errorf("No recent builds for %s.%s", mn, bn) | 321 log.Errorf("No recent builds for %s.%s", mn, bn) |
288 return alerts, errs | 322 return alerts, errs |
289 } | 323 } |
290 log.Infof("Checking %d most recent builds for alertable step failures: %
s/%s", len(recentBuildIDs), mn, bn) | 324 log.Infof("Checking %d most recent builds for alertable step failures: %
s/%s", len(recentBuildIDs), mn, bn) |
291 | 325 |
292 // Check for alertable step failures. | 326 // Check for alertable step failures. |
293 » for _, buildID := range recentBuildIDs { | 327 » stepFailureAlerts := map[string]messages.Alert{} |
| 328 » firstFailureInstance := map[string]int64{} |
| 329 |
| 330 » for i, buildID := range recentBuildIDs { |
294 failures, err := a.stepFailures(mn, bn, buildID) | 331 failures, err := a.stepFailures(mn, bn, buildID) |
295 if err != nil { | 332 if err != nil { |
296 errs = append(errs, err) | 333 errs = append(errs, err) |
297 } | 334 } |
| 335 if len(failures) == 0 && i > a.MinRecentBuilds { |
| 336 // Bail as soon as we find a successful build prior to t
he most recent couple of builds. |
| 337 break |
| 338 } |
| 339 |
298 as, err := a.stepFailureAlerts(failures) | 340 as, err := a.stepFailureAlerts(failures) |
299 if err != nil { | 341 if err != nil { |
300 errs = append(errs, err) | 342 errs = append(errs, err) |
301 } | 343 } |
302 » » alerts = append(alerts, as...) | 344 |
| 345 » » lastKey := "" |
| 346 » » // This probably isn't exactly what we want since multiple build
ers may have the same alert firing. |
| 347 » » // This just merges build ranges on a per-builder basis. |
| 348 » » for _, alr := range as { |
| 349 » » » // Only keep the most recent alert. Since recentBuildID
s is |
| 350 » » » // sorted newest build first, ignore older instances of
the alert. |
| 351 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok { |
| 352 » » » » stepFailureAlerts[alr.Key] = alr |
| 353 » » » } |
| 354 » » » if alr.Key == lastKey || lastKey == "" { |
| 355 » » » » if bf, ok := alr.Extension.(messages.BuildFailur
e); ok { |
| 356 » » » » » firstFailureInstance[alr.Key] = bf.Build
ers[0].FirstFailure |
| 357 » » » » } else { |
| 358 » » » » » log.Errorf("Couldn't cast alert extensio
n to BuildFailure: %s", alr.Key) |
| 359 » » » » } |
| 360 » » » } |
| 361 » » » lastKey = alr.Key |
| 362 » » } |
303 } | 363 } |
304 | 364 |
305 » // Check for stale builders. Latest build is the first in the list. | 365 » for _, alr := range stepFailureAlerts { |
| 366 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure =
firstFailureInstance[alr.Key] |
| 367 » » alerts = append(alerts, alr) |
| 368 » } |
| 369 |
| 370 » // Check for stale/idle/offline builders. Latest build is the first in
the list. |
306 lastBuildID := recentBuildIDs[0] | 371 lastBuildID := recentBuildIDs[0] |
307 log.Infof("Checking last build ID: %d", lastBuildID) | 372 log.Infof("Checking last build ID: %d", lastBuildID) |
| 373 |
308 // TODO: get this from cache. | 374 // TODO: get this from cache. |
309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID) | 375 » a.bLock.Lock() |
310 » if err != nil { | 376 » lastBuild := a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)] |
311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for
%s.%s: %s", lastBuildID, mn, bn, err)) | 377 » a.bLock.Unlock() |
312 » » return alerts, errs | 378 » if lastBuild == nil { |
| 379 » » var err error |
| 380 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID) |
| 381 » » if err != nil { |
| 382 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil
d %d for %s.%s: %s", lastBuildID, mn, bn, err)) |
| 383 » » » return alerts, errs |
| 384 » » } |
313 } | 385 } |
314 | 386 |
315 // Examining only the latest build is probably suboptimal since if it's
still in progress it might | 387 // Examining only the latest build is probably suboptimal since if it's
still in progress it might |
316 // not have hit a step that is going to fail and has failed repeatedly f
or the last few builds. | 388 // not have hit a step that is going to fail and has failed repeatedly f
or the last few builds. |
317 // AKA "Reliable failures". TODO: Identify "Reliable failures" | 389 // AKA "Reliable failures". TODO: Identify "Reliable failures" |
318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) | 390 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) |
319 if err != nil { | 391 if err != nil { |
320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f
or %s.%s: %v", mn, bn, err)) | 392 errs = append(errs, fmt.Errorf("Couldn't get latest build step f
or %s.%s: %v", mn, bn, err)) |
321 return alerts, errs | 393 return alerts, errs |
322 | 394 |
323 } | 395 } |
324 elapsed := a.now().Sub(lastUpdated.Time()) | 396 elapsed := a.now().Sub(lastUpdated.Time()) |
325 links := []messages.Link{ | 397 links := []messages.Link{ |
326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder
s/%s", mn, bn)}, | 398 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder
s/%s", mn, bn)}, |
327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil
ders/%s/builds/%d", mn, bn, lastBuildID)}, | 399 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil
ders/%s/builds/%d", mn, bn, lastBuildID)}, |
328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s
/builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, | 400 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s
/builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, |
329 } | 401 } |
330 | 402 |
331 switch b.State { | 403 switch b.State { |
332 case messages.StateBuilding: | 404 case messages.StateBuilding: |
333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun
{ | 405 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun
{ |
334 alerts = append(alerts, messages.Alert{ | 406 alerts = append(alerts, messages.Alert{ |
335 Key: fmt.Sprintf("%s.%s.hung", mn, bn), | 407 Key: fmt.Sprintf("%s.%s.hung", mn, bn), |
336 Title: fmt.Sprintf("%s.%s is hung in step %s.
", mn, bn, lastStep), | 408 Title: fmt.Sprintf("%s.%s is hung in step %s.
", mn, bn, lastStep), |
337 Body: fmt.Sprintf("%s.%s has been building f
or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed
, lastUpdated.Time(), a.HungBuilderThresh), | 409 Body: fmt.Sprintf("%s.%s has been building f
or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed
, lastUpdated.Time(), a.HungBuilderThresh), |
338 » » » » Severity: 0, | 410 » » » » Severity: hungBuilderSev, |
339 Time: messages.TimeToEpochTime(a.now()), | 411 Time: messages.TimeToEpochTime(a.now()), |
340 Links: links, | 412 Links: links, |
341 }) | 413 }) |
342 // Note, just because it's building doesn't mean it's in
a good state. If the last N builds | 414 // Note, just because it's building doesn't mean it's in
a good state. If the last N builds |
343 // all failed (for some large N) then this might still b
e alertable. | 415 // all failed (for some large N) then this might still b
e alertable. |
344 } | 416 } |
345 case messages.StateOffline: | 417 case messages.StateOffline: |
346 if elapsed > a.OfflineBuilderThresh { | 418 if elapsed > a.OfflineBuilderThresh { |
347 alerts = append(alerts, messages.Alert{ | 419 alerts = append(alerts, messages.Alert{ |
348 Key: fmt.Sprintf("%s.%s.offline", mn, bn), | 420 Key: fmt.Sprintf("%s.%s.offline", mn, bn), |
349 Title: fmt.Sprintf("%s.%s is offline.", mn, b
n), | 421 Title: fmt.Sprintf("%s.%s is offline.", mn, b
n), |
350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo
r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed,
lastUpdated.Time(), a.OfflineBuilderThresh), | 422 » » » » Body: fmt.Sprintf("%s.%s has been offline fo
r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps
ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh), |
351 » » » » Severity: 0, | 423 » » » » Severity: offlineBuilderSev, |
352 Time: messages.TimeToEpochTime(a.now()), | 424 Time: messages.TimeToEpochTime(a.now()), |
353 Links: links, | 425 Links: links, |
354 }) | 426 }) |
355 } | 427 } |
356 case messages.StateIdle: | 428 case messages.StateIdle: |
357 if b.PendingBuilds > a.IdleBuilderCountThresh { | 429 if b.PendingBuilds > a.IdleBuilderCountThresh { |
358 alerts = append(alerts, messages.Alert{ | 430 alerts = append(alerts, messages.Alert{ |
359 Key: fmt.Sprintf("%s.%s.idle", mn, bn), | 431 Key: fmt.Sprintf("%s.%s.idle", mn, bn), |
360 Title: fmt.Sprintf("%s.%s is idle with too ma
ny pending builds.", mn, bn), | 432 Title: fmt.Sprintf("%s.%s is idle with too ma
ny pending builds.", mn, bn), |
361 Body: fmt.Sprintf("%s.%s is idle with %d pen
ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle
BuilderCountThresh), | 433 Body: fmt.Sprintf("%s.%s is idle with %d pen
ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle
BuilderCountThresh), |
362 » » » » Severity: 0, | 434 » » » » Severity: idleBuilderSev, |
363 Time: messages.TimeToEpochTime(a.now()), | 435 Time: messages.TimeToEpochTime(a.now()), |
364 Links: links, | 436 Links: links, |
365 }) | 437 }) |
366 } | 438 } |
367 default: | 439 default: |
368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) | 440 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) |
369 } | 441 } |
370 | 442 |
371 return alerts, errs | 443 return alerts, errs |
372 } | 444 } |
373 | 445 |
374 // stepFailures returns the steps that have failed recently on builder bn. | 446 // stepFailures returns the steps that have failed recently on builder bn. |
375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
ilure, error) { | 447 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
ilure, error) { |
376 cc := cacheKeyForBuild(mn, bn, bID) | 448 cc := cacheKeyForBuild(mn, bn, bID) |
377 | 449 |
378 var err error // To avoid re-scoping b in the nested conditional below w
ith a :=. | 450 var err error // To avoid re-scoping b in the nested conditional below w
ith a :=. |
| 451 a.bLock.Lock() |
379 b, ok := a.bCache[cc] | 452 b, ok := a.bCache[cc] |
| 453 a.bLock.Unlock() |
380 if !ok { | 454 if !ok { |
381 log.Infof("Cache miss for %s", cc) | 455 log.Infof("Cache miss for %s", cc) |
382 b, err = a.Client.Build(mn, bn, bID) | 456 b, err = a.Client.Build(mn, bn, bID) |
383 if err != nil || b == nil { | 457 if err != nil || b == nil { |
384 log.Errorf("Error fetching build: %v", err) | 458 log.Errorf("Error fetching build: %v", err) |
385 return nil, err | 459 return nil, err |
386 } | 460 } |
387 } | 461 } |
388 | 462 |
389 ret := []stepFailure{} | 463 ret := []stepFailure{} |
| 464 |
390 for _, s := range b.Steps { | 465 for _, s := range b.Steps { |
391 if !s.IsFinished || len(s.Results) == 0 { | 466 if !s.IsFinished || len(s.Results) == 0 { |
392 continue | 467 continue |
393 } | 468 } |
394 // Because Results in the json data is a homogeneous array, the
unmarshaler | 469 // Because Results in the json data is a homogeneous array, the
unmarshaler |
395 // doesn't have any type information to assert about it. We have
to do | 470 // doesn't have any type information to assert about it. We have
to do |
396 // some ugly runtime type assertion ourselves. | 471 // some ugly runtime type assertion ourselves. |
397 if r, ok := s.Results[0].(float64); ok { | 472 if r, ok := s.Results[0].(float64); ok { |
398 if r == 0 || r == 1 { | 473 if r == 0 || r == 1 { |
399 // This 0/1 check seems to be a convention or he
uristic. A 0 or 1 | 474 // This 0/1 check seems to be a convention or he
uristic. A 0 or 1 |
(...skipping 22 matching lines...) Expand all Loading... |
422 ret := []messages.Alert{} | 497 ret := []messages.Alert{} |
423 type res struct { | 498 type res struct { |
424 f stepFailure | 499 f stepFailure |
425 a *messages.Alert | 500 a *messages.Alert |
426 err error | 501 err error |
427 } | 502 } |
428 | 503 |
429 // Might not need full capacity buffer, since some failures are ignored
below. | 504 // Might not need full capacity buffer, since some failures are ignored
below. |
430 rs := make(chan res, len(failures)) | 505 rs := make(chan res, len(failures)) |
431 | 506 |
| 507 scannedFailures := []stepFailure{} |
432 for _, f := range failures { | 508 for _, f := range failures { |
433 // goroutine/channel because the reasonsForFailure call potentia
lly | 509 // goroutine/channel because the reasonsForFailure call potentia
lly |
434 // blocks on IO. | 510 // blocks on IO. |
| 511 if f.step.Name == "steps" { |
| 512 continue |
| 513 // The actual breaking step will appear later. |
| 514 } |
| 515 scannedFailures = append(scannedFailures, f) |
435 go func(f stepFailure) { | 516 go func(f stepFailure) { |
436 alr := messages.Alert{ | 517 alr := messages.Alert{ |
437 Title: fmt.Sprintf("Builder step failure: %s.%s"
, f.masterName, f.builderName), | 518 Title: fmt.Sprintf("Builder step failure: %s.%s"
, f.masterName, f.builderName), |
438 Time: messages.EpochTime(a.now().Unix()), | 519 Time: messages.EpochTime(a.now().Unix()), |
439 Type: "buildfailure", | 520 Type: "buildfailure", |
440 } | 521 } |
441 | 522 |
| 523 // If the builder has been failing on the same step for
multiple builds in a row, |
| 524 // we should have only one alert but indicate the range
of builds affected. |
| 525 // These are set in FirstFailure and LastFailure. |
442 bf := messages.BuildFailure{ | 526 bf := messages.BuildFailure{ |
443 // FIXME: group builders? | 527 // FIXME: group builders? |
444 Builders: []messages.AlertedBuilder{ | 528 Builders: []messages.AlertedBuilder{ |
445 { | 529 { |
446 Name: f.builderName, | 530 Name: f.builderName, |
447 » » » » » » URL: f.URL(), | 531 » » » » » » URL: fmt.Sprintf("http
s://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName), |
448 » » » » » » FirstFailure: 0, | 532 » » » » » » FirstFailure: f.build.Number, |
449 » » » » » » LatestFailure: 1, | 533 » » » » » » LatestFailure: f.build.Number, |
450 }, | 534 }, |
451 }, | 535 }, |
452 // TODO: RegressionRanges: | 536 // TODO: RegressionRanges: |
453 // look into Builds.SourceStamp.Changes. | 537 // look into Builds.SourceStamp.Changes. |
454 } | 538 } |
455 | 539 |
456 reasons := a.reasonsForFailure(f) | 540 reasons := a.reasonsForFailure(f) |
457 for _, r := range reasons { | 541 for _, r := range reasons { |
458 bf.Reasons = append(bf.Reasons, messages.Reason{ | 542 bf.Reasons = append(bf.Reasons, messages.Reason{ |
459 TestName: r, | 543 TestName: r, |
460 Step: f.step.Name, | 544 Step: f.step.Name, |
| 545 URL: f.URL(), |
461 }) | 546 }) |
462 } | 547 } |
463 | 548 |
464 alr.Extension = bf | 549 alr.Extension = bf |
465 if len(bf.Reasons) == 0 { | 550 if len(bf.Reasons) == 0 { |
466 » » » » log.Warnf("No reasons for step failure: %s", ale
rtKey(f.masterName, f.builderName, f.step.Name, "")) | 551 » » » » alr.Key = alertKey(f.masterName, f.builderName,
f.step.Name, "") |
467 » » » » rs <- res{ | 552 » » » » log.Warnf("No reasons for step failure: %s", alr
.Key) |
468 » » » » » f: f, | |
469 » » » » } | |
470 } else { | 553 } else { |
471 // Should the key include all of the reasons? | 554 // Should the key include all of the reasons? |
472 alr.Key = alertKey(f.masterName, f.builderName,
f.step.Name, reasons[0]) | 555 alr.Key = alertKey(f.masterName, f.builderName,
f.step.Name, reasons[0]) |
473 | 556 » » » } |
474 » » » » rs <- res{ | 557 » » » rs <- res{ |
475 » » » » » f: f, | 558 » » » » f: f, |
476 » » » » » a: &alr, | 559 » » » » a: &alr, |
477 » » » » » err: nil, | 560 » » » » err: nil, |
478 » » » » } | |
479 } | 561 } |
480 }(f) | 562 }(f) |
481 } | 563 } |
482 | 564 |
483 » for _ = range failures { | 565 » for _ = range scannedFailures { |
484 r := <-rs | 566 r := <-rs |
485 if r.a != nil { | 567 if r.a != nil { |
486 ret = append(ret, *r.a) | 568 ret = append(ret, *r.a) |
487 } | 569 } |
488 } | 570 } |
489 | 571 |
490 return ret, nil | 572 return ret, nil |
491 } | 573 } |
492 | 574 |
493 // reasonsForFailure examines the step failure and applies some heuristics to | 575 // reasonsForFailure examines the step failure and applies some heuristics to |
494 // to find the cause. It may make blocking IO calls in the process. | 576 // to find the cause. It may make blocking IO calls in the process. |
495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { | 577 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { |
496 ret := []string{} | 578 ret := []string{} |
497 recognized := false | 579 recognized := false |
498 log.Infof("Checking for reasons for failure step: %v", f.step.Name) | 580 log.Infof("Checking for reasons for failure step: %v", f.step.Name) |
499 for _, sfa := range a.StepAnalyzers { | 581 for _, sfa := range a.StepAnalyzers { |
500 res, err := sfa.Analyze(f) | 582 res, err := sfa.Analyze(f) |
501 if err != nil { | 583 if err != nil { |
| 584 // TODO: return something that contains errors *and* rea
sons. |
502 log.Errorf("Error get reasons from StepAnalyzer: %v", er
r) | 585 log.Errorf("Error get reasons from StepAnalyzer: %v", er
r) |
| 586 continue |
503 } | 587 } |
504 if res.Recognized { | 588 if res.Recognized { |
505 recognized = true | 589 recognized = true |
506 ret = append(ret, res.Reasons...) | 590 ret = append(ret, res.Reasons...) |
507 } | 591 } |
508 } | 592 } |
509 | 593 |
510 if !recognized { | 594 if !recognized { |
511 // TODO: log and report frequently encountered unrecognized buil
der step | 595 // TODO: log and report frequently encountered unrecognized buil
der step |
512 // failure names. | 596 // failure names. |
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. | 636 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. |
553 func oldEscape(s string) string { | 637 func oldEscape(s string) string { |
554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) | 638 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) |
555 } | 639 } |
556 | 640 |
557 // URL returns a url to builder step failure page. | 641 // URL returns a url to builder step failure page. |
558 func (f stepFailure) URL() string { | 642 func (f stepFailure) URL() string { |
559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/%
d/steps/%s", | 643 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/%
d/steps/%s", |
560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap
e(f.step.Name)) | 644 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap
e(f.step.Name)) |
561 } | 645 } |
OLD | NEW |