Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(567)

Side by Side Diff: go/src/infra/monitoring/analyzer/analyzer.go

Issue 1125263004: dispatcher: fix test result parsing, build ranges for failure alerts, other fixes (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: added comments, some bounds/type assertion checks Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package analyzer 5 package analyzer
6 6
7 import ( 7 import (
8 "errors" 8 "errors"
9 "fmt" 9 "fmt"
10 "net/url" 10 "net/url"
11 "path/filepath" 11 "path/filepath"
12 "sort" 12 "sort"
13 "strings" 13 "strings"
14 "sync"
14 "time" 15 "time"
15 16
16 "github.com/Sirupsen/logrus" 17 "github.com/Sirupsen/logrus"
17 18
18 "infra/monitoring/client" 19 "infra/monitoring/client"
19 "infra/monitoring/messages" 20 "infra/monitoring/messages"
20 ) 21 )
21 22
22 const ( 23 const (
23 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete.
24 » StepCompletedRun = "completed run" 25 » StepCompletedRun = "completed run"
26 » treeCloserPri = 0
27 » staleMasterSev = 0
28 » staleBuilderSev = 0
29 » hungBuilderSev = 1
30 » idleBuilderSev = 1
31 » offlineBuilderSev = 1
25 ) 32 )
26 33
27 var ( 34 var (
28 log = logrus.New() 35 log = logrus.New()
29 ) 36 )
30 37
31 var ( 38 var (
32 errNoBuildSteps = errors.New("No build steps") 39 errNoBuildSteps = errors.New("No build steps")
33 ) 40 )
34 41
(...skipping 14 matching lines...) Expand all
49 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer. 56 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer.
50 Reasons []string 57 Reasons []string
51 } 58 }
52 59
53 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on, 60 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on,
54 // in order to produce alerts. 61 // in order to produce alerts.
55 type MasterAnalyzer struct { 62 type MasterAnalyzer struct {
56 // MaxRecentBuilds is the maximum number of recent builds to check, per builder. 63 // MaxRecentBuilds is the maximum number of recent builds to check, per builder.
57 MaxRecentBuilds int 64 MaxRecentBuilds int
58 65
66 // MinRecentBuilds is the minimum number of recent builds to check, per builder.
67 MinRecentBuilds int
68
59 // StepAnalzers are the set of build step failure analyzers to be checke d on 69 // StepAnalzers are the set of build step failure analyzers to be checke d on
60 // build step failures. 70 // build step failures.
61 StepAnalyzers []StepAnalyzer 71 StepAnalyzers []StepAnalyzer
62 72
63 // Client is the Client implementation for fetching json from CBE, build s, etc. 73 // Client is the Client implementation for fetching json from CBE, build s, etc.
64 Client client.Client 74 Client client.Client
65 75
66 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building" 76 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building"
67 // before triggering a "hung builder" alert. 77 // before triggering a "hung builder" alert.
68 HungBuilderThresh time.Duration 78 HungBuilderThresh time.Duration
69 79
70 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline" 80 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline"
71 // before triggering an "offline builder" alert. 81 // before triggering an "offline builder" alert.
72 OfflineBuilderThresh time.Duration 82 OfflineBuilderThresh time.Duration
73 83
74 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue 84 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue
75 // while in the "idle" state before triggering an "idle builder" alert. 85 // while in the "idle" state before triggering an "idle builder" alert.
76 IdleBuilderCountThresh int64 86 IdleBuilderCountThresh int64
77 87
78 // StaleMasterThreshold is the maximum age that master data from CBE can be before 88 // StaleMasterThreshold is the maximum age that master data from CBE can be before
79 // triggering a "stale master" alert. 89 // triggering a "stale master" alert.
80 StaleMasterThreshold time.Duration 90 StaleMasterThreshold time.Duration
81 91
82 // bCache is a map of build cache key to Build message. 92 // bCache is a map of build cache key to Build message.
83 bCache map[string]*messages.Builds 93 bCache map[string]*messages.Builds
94 // bLock protects bCache
95 bLock *sync.Mutex
96
97 // These limit the scope analysis, useful for debugging.
98 TreeOnly string
99 MasterOnly string
100 BuilderOnly string
101 BuildOnly int64
84 102
85 // now is useful for mocking the system clock in testing. 103 // now is useful for mocking the system clock in testing.
86 now func() time.Time 104 now func() time.Time
87 } 105 }
88 106
89 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion. 107 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion.
90 // maxBuilds is the maximum number of builds to check, per builder. 108 // maxBuilds is the maximum number of builds to check, per builder.
91 func New(c client.Client, maxBuilds int) *MasterAnalyzer { 109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer {
92 if c == nil { 110 if c == nil {
93 c = client.New() 111 c = client.New()
94 } 112 }
95 113
96 return &MasterAnalyzer{ 114 return &MasterAnalyzer{
97 Client: c, 115 Client: c,
98 MaxRecentBuilds: maxBuilds, 116 MaxRecentBuilds: maxBuilds,
117 MinRecentBuilds: minBuilds,
99 HungBuilderThresh: 3 * time.Hour, 118 HungBuilderThresh: 3 * time.Hour,
100 OfflineBuilderThresh: 90 * time.Minute, 119 OfflineBuilderThresh: 90 * time.Minute,
101 IdleBuilderCountThresh: 50, 120 IdleBuilderCountThresh: 50,
102 StaleMasterThreshold: 10 * time.Minute, 121 StaleMasterThreshold: 10 * time.Minute,
103 StepAnalyzers: []StepAnalyzer{ 122 StepAnalyzers: []StepAnalyzer{
104 &TestFailureAnalyzer{Client: c}, 123 &TestFailureAnalyzer{Client: c},
124 &CompileFailureAnalyzer{Client: c},
105 }, 125 },
106 126
107 now: func() time.Time { 127 now: func() time.Time {
108 return time.Now() 128 return time.Now()
109 }, 129 },
110 130
111 bCache: map[string]*messages.Builds{}, 131 bCache: map[string]*messages.Builds{},
132 bLock: &sync.Mutex{},
112 } 133 }
113 } 134 }
114 135
115 // MasterAlerts returns alerts generated from the master at URL. 136 // MasterAlerts returns alerts generated from the master at URL.
116 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert { 137 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert {
117 ret := []messages.Alert{} 138 ret := []messages.Alert{}
118 139
119 // Copied logic from builder_messages. 140 // Copied logic from builder_messages.
120 // No created_timestamp should be a warning sign, no? 141 // No created_timestamp should be a warning sign, no?
121 if be.CreatedTimestamp == messages.EpochTime(0) { 142 if be.CreatedTimestamp == messages.EpochTime(0) {
122 return ret 143 return ret
123 } 144 }
124 145
125 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) 146 elapsed := a.now().Sub(be.CreatedTimestamp.Time())
126 if elapsed > a.StaleMasterThreshold { 147 if elapsed > a.StaleMasterThreshold {
127 ret = append(ret, messages.Alert{ 148 ret = append(ret, messages.Alert{
128 Key: fmt.Sprintf("stale master: %v", url), 149 Key: fmt.Sprintf("stale master: %v", url),
129 Title: "Stale Master Data", 150 Title: "Stale Master Data",
130 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()), 151 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()),
131 » » » Severity: 0, 152 » » » Severity: staleMasterSev,
132 Time: messages.TimeToEpochTime(a.now()), 153 Time: messages.TimeToEpochTime(a.now()),
133 Links: []messages.Link{{"Master", url}}, 154 Links: []messages.Link{{"Master", url}},
134 // No type or extension for now. 155 // No type or extension for now.
135 }) 156 })
136 } 157 }
137 if elapsed < 0 { 158 if elapsed < 0 {
138 // Add this to the alerts returned, rather than just log it? 159 // Add this to the alerts returned, rather than just log it?
139 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed) 160 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed)
140 } 161 }
141 162
(...skipping 11 matching lines...) Expand all
153 type r struct { 174 type r struct {
154 bn string 175 bn string
155 b messages.Builders 176 b messages.Builders
156 alerts []messages.Alert 177 alerts []messages.Alert
157 err []error 178 err []error
158 } 179 }
159 c := make(chan r, len(be.Builders)) 180 c := make(chan r, len(be.Builders))
160 181
161 // TODO: get a list of all the running builds from be.Slaves? It 182 // TODO: get a list of all the running builds from be.Slaves? It
162 // appears to be used later on in the original py. 183 // appears to be used later on in the original py.
184 scannedBuilders := []string{}
163 for bn, b := range be.Builders { 185 for bn, b := range be.Builders {
186 if a.BuilderOnly != "" && bn != a.BuilderOnly {
187 continue
188 }
189 scannedBuilders = append(scannedBuilders, bn)
164 go func(bn string, b messages.Builders) { 190 go func(bn string, b messages.Builders) {
165 out := r{bn: bn, b: b} 191 out := r{bn: bn, b: b}
166 defer func() { 192 defer func() {
167 c <- out 193 c <- out
168 }() 194 }()
169 195
170 // This blocks on IO, hence the goroutine. 196 // This blocks on IO, hence the goroutine.
171 a.warmBuildCache(mn, bn, b.CachedBuilds) 197 a.warmBuildCache(mn, bn, b.CachedBuilds)
172
173 // Each call to builderAlerts may trigger blocking json fetches, 198 // Each call to builderAlerts may trigger blocking json fetches,
174 // but it has a data dependency on the above cache-warmi ng call, so 199 // but it has a data dependency on the above cache-warmi ng call, so
175 // the logic remains serial. 200 // the logic remains serial.
176 out.alerts, out.err = a.builderAlerts(mn, bn, &b) 201 out.alerts, out.err = a.builderAlerts(mn, bn, &b)
177 }(bn, b) 202 }(bn, b)
178 } 203 }
179 204
180 ret := []messages.Alert{} 205 ret := []messages.Alert{}
181 » for bn := range be.Builders { 206 » for _, bn := range scannedBuilders {
182 r := <-c 207 r := <-c
183 if len(r.err) != 0 { 208 if len(r.err) != 0 {
184 // TODO: add a special alert for this too? 209 // TODO: add a special alert for this too?
185 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err) 210 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err)
186 } else { 211 } else {
187 ret = append(ret, r.alerts...) 212 ret = append(ret, r.alerts...)
188 } 213 }
189 } 214 }
190 215
191 return ret 216 return ret
192 } 217 }
193 218
194 // masterName extracts the name of the master from the master's URL. 219 // masterName extracts the name of the master from the master's URL.
195 func masterName(URL string) (string, error) { 220 func masterName(URL string) (string, error) {
196 mURL, err := url.Parse(URL) 221 mURL, err := url.Parse(URL)
197 if err != nil { 222 if err != nil {
198 return "", err 223 return "", err
199 } 224 }
200 pathParts := strings.Split(mURL.Path, "/") 225 pathParts := strings.Split(mURL.Path, "/")
201 » return pathParts[len(pathParts)-1], nil 226 » if len(pathParts) < 2 {
227 » » return "", fmt.Errorf("Couldn't parse master name from %s", URL)
228 » }
229 » // -2 to lop the /json off the http://.../{master.name}/json suffix
230 » return pathParts[len(pathParts)-2], nil
202 } 231 }
203 232
204 func cacheKeyForBuild(master, builder string, number int64) string { 233 func cacheKeyForBuild(master, builder string, number int64) string {
205 return filepath.FromSlash( 234 return filepath.FromSlash(
206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number)) 235 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number))
207 } 236 }
208 237
209 // TODO: actually write the on-disk cache. 238 // TODO: actually write the on-disk cache.
210 func filenameForCacheKey(cc string) string { 239 func filenameForCacheKey(cc string) string {
211 cc = strings.Replace(cc, "/", "_", -1) 240 cc = strings.Replace(cc, "/", "_", -1)
(...skipping 10 matching lines...) Expand all
222 v.Add("builder", builder) 251 v.Add("builder", builder)
223 252
224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode()) 253 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode())
225 res := struct { 254 res := struct {
226 Builds []messages.Builds `json:"builds"` 255 Builds []messages.Builds `json:"builds"`
227 }{} 256 }{}
228 257
229 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and 258 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and
230 // return (map[{master, builder}][]Builds, map [{master, builder}]error) 259 // return (map[{master, builder}][]Builds, map [{master, builder}]error)
231 // That way we can do all of these in parallel. 260 // That way we can do all of these in parallel.
261
232 status, err := a.Client.JSON(URL, &res) 262 status, err := a.Client.JSON(URL, &res)
233 if err != nil { 263 if err != nil {
234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) 264 log.Errorf("Error (%d) fetching %s: %s", status, URL, err)
235 } 265 }
236 266
267 a.bLock.Lock()
237 for _, b := range res.Builds { 268 for _, b := range res.Builds {
238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b 269 » » // TODO: consider making res.Builds be []*messages.Builds instea d of []messages.Builds
270 » » ba := b
271 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba
239 } 272 }
273 a.bLock.Unlock()
240 } 274 }
241 275
242 // This type is used for sorting build IDs. 276 // This type is used for sorting build IDs.
243 type buildIDs []int64 277 type buildIDs []int64
244 278
245 func (a buildIDs) Len() int { return len(a) } 279 func (a buildIDs) Len() int { return len(a) }
246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 280 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } 281 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] }
248 282
249 // latestBuildStep returns the latest build step name and update time, and an er ror 283 // latestBuildStep returns the latest build step name and update time, and an er ror
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] 317 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]
284 } 318 }
285 if len(recentBuildIDs) == 0 { 319 if len(recentBuildIDs) == 0 {
286 // TODO: Make an alert for this? 320 // TODO: Make an alert for this?
287 log.Errorf("No recent builds for %s.%s", mn, bn) 321 log.Errorf("No recent builds for %s.%s", mn, bn)
288 return alerts, errs 322 return alerts, errs
289 } 323 }
290 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn) 324 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn)
291 325
292 // Check for alertable step failures. 326 // Check for alertable step failures.
293 » for _, buildID := range recentBuildIDs { 327 » stepFailureAlerts := map[string]messages.Alert{}
328 » firstFailureInstance := map[string]int64{}
329
330 » for i, buildID := range recentBuildIDs {
294 failures, err := a.stepFailures(mn, bn, buildID) 331 failures, err := a.stepFailures(mn, bn, buildID)
295 if err != nil { 332 if err != nil {
296 errs = append(errs, err) 333 errs = append(errs, err)
297 } 334 }
335 if len(failures) == 0 && i > a.MinRecentBuilds {
336 // Bail as soon as we find a successful build prior to t he most recent couple of builds.
337 break
338 }
339
298 as, err := a.stepFailureAlerts(failures) 340 as, err := a.stepFailureAlerts(failures)
299 if err != nil { 341 if err != nil {
300 errs = append(errs, err) 342 errs = append(errs, err)
301 } 343 }
302 » » alerts = append(alerts, as...) 344
345 » » lastKey := ""
346 » » // This probably isn't exactly what we want since multiple build ers may have the same alert firing.
347 » » // This just merges build ranges on a per-builder basis.
348 » » for _, alr := range as {
349 » » » // Only keep the most recent alert. Since recentBuildID s is
350 » » » // sorted newest build first, ignore older instances of the alert.
351 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok {
352 » » » » stepFailureAlerts[alr.Key] = alr
353 » » » }
354 » » » if alr.Key == lastKey || lastKey == "" {
355 » » » » if bf, ok := alr.Extension.(messages.BuildFailur e); ok {
356 » » » » » firstFailureInstance[alr.Key] = bf.Build ers[0].FirstFailure
357 » » » » } else {
358 » » » » » log.Errorf("Couldn't cast alert extensio n to BuildFailure: %s", alr.Key)
359 » » » » }
360 » » » }
361 » » » lastKey = alr.Key
362 » » }
303 } 363 }
304 364
305 » // Check for stale builders. Latest build is the first in the list. 365 » for _, alr := range stepFailureAlerts {
366 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key]
367 » » alerts = append(alerts, alr)
368 » }
369
370 » // Check for stale/idle/offline builders. Latest build is the first in the list.
306 lastBuildID := recentBuildIDs[0] 371 lastBuildID := recentBuildIDs[0]
307 log.Infof("Checking last build ID: %d", lastBuildID) 372 log.Infof("Checking last build ID: %d", lastBuildID)
373
308 // TODO: get this from cache. 374 // TODO: get this from cache.
309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID) 375 » a.bLock.Lock()
310 » if err != nil { 376 » lastBuild := a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)]
311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err)) 377 » a.bLock.Unlock()
312 » » return alerts, errs 378 » if lastBuild == nil {
379 » » var err error
380 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID)
381 » » if err != nil {
382 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil d %d for %s.%s: %s", lastBuildID, mn, bn, err))
383 » » » return alerts, errs
384 » » }
313 } 385 }
314 386
315 // Examining only the latest build is probably suboptimal since if it's still in progress it might 387 // Examining only the latest build is probably suboptimal since if it's still in progress it might
316 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds. 388 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds.
317 // AKA "Reliable failures". TODO: Identify "Reliable failures" 389 // AKA "Reliable failures". TODO: Identify "Reliable failures"
318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) 390 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)
319 if err != nil { 391 if err != nil {
320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err)) 392 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err))
321 return alerts, errs 393 return alerts, errs
322 394
323 } 395 }
324 elapsed := a.now().Sub(lastUpdated.Time()) 396 elapsed := a.now().Sub(lastUpdated.Time())
325 links := []messages.Link{ 397 links := []messages.Link{
326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)}, 398 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)},
327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)}, 399 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)},
328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, 400 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)},
329 } 401 }
330 402
331 switch b.State { 403 switch b.State {
332 case messages.StateBuilding: 404 case messages.StateBuilding:
333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { 405 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
334 alerts = append(alerts, messages.Alert{ 406 alerts = append(alerts, messages.Alert{
335 Key: fmt.Sprintf("%s.%s.hung", mn, bn), 407 Key: fmt.Sprintf("%s.%s.hung", mn, bn),
336 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep), 408 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep),
337 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh), 409 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh),
338 » » » » Severity: 0, 410 » » » » Severity: hungBuilderSev,
339 Time: messages.TimeToEpochTime(a.now()), 411 Time: messages.TimeToEpochTime(a.now()),
340 Links: links, 412 Links: links,
341 }) 413 })
342 // Note, just because it's building doesn't mean it's in a good state. If the last N builds 414 // Note, just because it's building doesn't mean it's in a good state. If the last N builds
343 // all failed (for some large N) then this might still b e alertable. 415 // all failed (for some large N) then this might still b e alertable.
344 } 416 }
345 case messages.StateOffline: 417 case messages.StateOffline:
346 if elapsed > a.OfflineBuilderThresh { 418 if elapsed > a.OfflineBuilderThresh {
347 alerts = append(alerts, messages.Alert{ 419 alerts = append(alerts, messages.Alert{
348 Key: fmt.Sprintf("%s.%s.offline", mn, bn), 420 Key: fmt.Sprintf("%s.%s.offline", mn, bn),
349 Title: fmt.Sprintf("%s.%s is offline.", mn, b n), 421 Title: fmt.Sprintf("%s.%s is offline.", mn, b n),
350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh), 422 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
351 » » » » Severity: 0, 423 » » » » Severity: offlineBuilderSev,
352 Time: messages.TimeToEpochTime(a.now()), 424 Time: messages.TimeToEpochTime(a.now()),
353 Links: links, 425 Links: links,
354 }) 426 })
355 } 427 }
356 case messages.StateIdle: 428 case messages.StateIdle:
357 if b.PendingBuilds > a.IdleBuilderCountThresh { 429 if b.PendingBuilds > a.IdleBuilderCountThresh {
358 alerts = append(alerts, messages.Alert{ 430 alerts = append(alerts, messages.Alert{
359 Key: fmt.Sprintf("%s.%s.idle", mn, bn), 431 Key: fmt.Sprintf("%s.%s.idle", mn, bn),
360 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn), 432 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn),
361 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh), 433 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh),
362 » » » » Severity: 0, 434 » » » » Severity: idleBuilderSev,
363 Time: messages.TimeToEpochTime(a.now()), 435 Time: messages.TimeToEpochTime(a.now()),
364 Links: links, 436 Links: links,
365 }) 437 })
366 } 438 }
367 default: 439 default:
368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) 440 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State)
369 } 441 }
370 442
371 return alerts, errs 443 return alerts, errs
372 } 444 }
373 445
374 // stepFailures returns the steps that have failed recently on builder bn. 446 // stepFailures returns the steps that have failed recently on builder bn.
375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) { 447 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) {
376 cc := cacheKeyForBuild(mn, bn, bID) 448 cc := cacheKeyForBuild(mn, bn, bID)
377 449
378 var err error // To avoid re-scoping b in the nested conditional below w ith a :=. 450 var err error // To avoid re-scoping b in the nested conditional below w ith a :=.
451 a.bLock.Lock()
379 b, ok := a.bCache[cc] 452 b, ok := a.bCache[cc]
453 a.bLock.Unlock()
380 if !ok { 454 if !ok {
381 log.Infof("Cache miss for %s", cc) 455 log.Infof("Cache miss for %s", cc)
382 b, err = a.Client.Build(mn, bn, bID) 456 b, err = a.Client.Build(mn, bn, bID)
383 if err != nil || b == nil { 457 if err != nil || b == nil {
384 log.Errorf("Error fetching build: %v", err) 458 log.Errorf("Error fetching build: %v", err)
385 return nil, err 459 return nil, err
386 } 460 }
387 } 461 }
388 462
389 ret := []stepFailure{} 463 ret := []stepFailure{}
464
390 for _, s := range b.Steps { 465 for _, s := range b.Steps {
391 if !s.IsFinished || len(s.Results) == 0 { 466 if !s.IsFinished || len(s.Results) == 0 {
392 continue 467 continue
393 } 468 }
394 // Because Results in the json data is a homogeneous array, the unmarshaler 469 // Because Results in the json data is a homogeneous array, the unmarshaler
395 // doesn't have any type information to assert about it. We have to do 470 // doesn't have any type information to assert about it. We have to do
396 // some ugly runtime type assertion ourselves. 471 // some ugly runtime type assertion ourselves.
397 if r, ok := s.Results[0].(float64); ok { 472 if r, ok := s.Results[0].(float64); ok {
398 if r == 0 || r == 1 { 473 if r == 0 || r == 1 {
399 // This 0/1 check seems to be a convention or he uristic. A 0 or 1 474 // This 0/1 check seems to be a convention or he uristic. A 0 or 1
(...skipping 22 matching lines...) Expand all
422 ret := []messages.Alert{} 497 ret := []messages.Alert{}
423 type res struct { 498 type res struct {
424 f stepFailure 499 f stepFailure
425 a *messages.Alert 500 a *messages.Alert
426 err error 501 err error
427 } 502 }
428 503
429 // Might not need full capacity buffer, since some failures are ignored below. 504 // Might not need full capacity buffer, since some failures are ignored below.
430 rs := make(chan res, len(failures)) 505 rs := make(chan res, len(failures))
431 506
507 scannedFailures := []stepFailure{}
432 for _, f := range failures { 508 for _, f := range failures {
433 // goroutine/channel because the reasonsForFailure call potentia lly 509 // goroutine/channel because the reasonsForFailure call potentia lly
434 // blocks on IO. 510 // blocks on IO.
511 if f.step.Name == "steps" {
512 continue
513 // The actual breaking step will appear later.
514 }
515 scannedFailures = append(scannedFailures, f)
435 go func(f stepFailure) { 516 go func(f stepFailure) {
436 alr := messages.Alert{ 517 alr := messages.Alert{
437 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), 518 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName),
438 Time: messages.EpochTime(a.now().Unix()), 519 Time: messages.EpochTime(a.now().Unix()),
439 Type: "buildfailure", 520 Type: "buildfailure",
440 } 521 }
441 522
523 // If the builder has been failing on the same step for multiple builds in a row,
524 // we should have only one alert but indicate the range of builds affected.
525 // These are set in FirstFailure and LastFailure.
442 bf := messages.BuildFailure{ 526 bf := messages.BuildFailure{
443 // FIXME: group builders? 527 // FIXME: group builders?
444 Builders: []messages.AlertedBuilder{ 528 Builders: []messages.AlertedBuilder{
445 { 529 {
446 Name: f.builderName, 530 Name: f.builderName,
447 » » » » » » URL: f.URL(), 531 » » » » » » URL: fmt.Sprintf("http s://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName),
448 » » » » » » FirstFailure: 0, 532 » » » » » » FirstFailure: f.build.Number,
449 » » » » » » LatestFailure: 1, 533 » » » » » » LatestFailure: f.build.Number,
450 }, 534 },
451 }, 535 },
452 // TODO: RegressionRanges: 536 // TODO: RegressionRanges:
453 // look into Builds.SourceStamp.Changes. 537 // look into Builds.SourceStamp.Changes.
454 } 538 }
455 539
456 reasons := a.reasonsForFailure(f) 540 reasons := a.reasonsForFailure(f)
457 for _, r := range reasons { 541 for _, r := range reasons {
458 bf.Reasons = append(bf.Reasons, messages.Reason{ 542 bf.Reasons = append(bf.Reasons, messages.Reason{
459 TestName: r, 543 TestName: r,
460 Step: f.step.Name, 544 Step: f.step.Name,
545 URL: f.URL(),
461 }) 546 })
462 } 547 }
463 548
464 alr.Extension = bf 549 alr.Extension = bf
465 if len(bf.Reasons) == 0 { 550 if len(bf.Reasons) == 0 {
466 » » » » log.Warnf("No reasons for step failure: %s", ale rtKey(f.masterName, f.builderName, f.step.Name, "")) 551 » » » » alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "")
467 » » » » rs <- res{ 552 » » » » log.Warnf("No reasons for step failure: %s", alr .Key)
468 » » » » » f: f,
469 » » » » }
470 } else { 553 } else {
471 // Should the key include all of the reasons? 554 // Should the key include all of the reasons?
472 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0]) 555 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0])
473 556 » » » }
474 » » » » rs <- res{ 557 » » » rs <- res{
475 » » » » » f: f, 558 » » » » f: f,
476 » » » » » a: &alr, 559 » » » » a: &alr,
477 » » » » » err: nil, 560 » » » » err: nil,
478 » » » » }
479 } 561 }
480 }(f) 562 }(f)
481 } 563 }
482 564
483 » for _ = range failures { 565 » for _ = range scannedFailures {
484 r := <-rs 566 r := <-rs
485 if r.a != nil { 567 if r.a != nil {
486 ret = append(ret, *r.a) 568 ret = append(ret, *r.a)
487 } 569 }
488 } 570 }
489 571
490 return ret, nil 572 return ret, nil
491 } 573 }
492 574
493 // reasonsForFailure examines the step failure and applies some heuristics to 575 // reasonsForFailure examines the step failure and applies some heuristics to
494 // to find the cause. It may make blocking IO calls in the process. 576 // to find the cause. It may make blocking IO calls in the process.
495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { 577 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string {
496 ret := []string{} 578 ret := []string{}
497 recognized := false 579 recognized := false
498 log.Infof("Checking for reasons for failure step: %v", f.step.Name) 580 log.Infof("Checking for reasons for failure step: %v", f.step.Name)
499 for _, sfa := range a.StepAnalyzers { 581 for _, sfa := range a.StepAnalyzers {
500 res, err := sfa.Analyze(f) 582 res, err := sfa.Analyze(f)
501 if err != nil { 583 if err != nil {
584 // TODO: return something that contains errors *and* rea sons.
502 log.Errorf("Error get reasons from StepAnalyzer: %v", er r) 585 log.Errorf("Error get reasons from StepAnalyzer: %v", er r)
586 continue
503 } 587 }
504 if res.Recognized { 588 if res.Recognized {
505 recognized = true 589 recognized = true
506 ret = append(ret, res.Reasons...) 590 ret = append(ret, res.Reasons...)
507 } 591 }
508 } 592 }
509 593
510 if !recognized { 594 if !recognized {
511 // TODO: log and report frequently encountered unrecognized buil der step 595 // TODO: log and report frequently encountered unrecognized buil der step
512 // failure names. 596 // failure names.
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. 636 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths.
553 func oldEscape(s string) string { 637 func oldEscape(s string) string {
554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) 638 return strings.Replace(url.QueryEscape(s), "+", "%20", -1)
555 } 639 }
556 640
557 // URL returns a url to builder step failure page. 641 // URL returns a url to builder step failure page.
558 func (f stepFailure) URL() string { 642 func (f stepFailure) URL() string {
559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s", 643 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s",
560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name)) 644 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name))
561 } 645 }
OLDNEW
« no previous file with comments | « no previous file | go/src/infra/monitoring/analyzer/analyzer_test.go » ('j') | go/src/infra/monitoring/messages/alerts.go » ('J')

Powered by Google App Engine
This is Rietveld 408576698