Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(143)

Side by Side Diff: go/src/infra/monitoring/analyzer/analyzer.go

Issue 1125263004: dispatcher: fix test result parsing, build ranges for failure alerts, other fixes (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: fixed comments Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package analyzer 5 package analyzer
6 6
7 import ( 7 import (
8 "errors" 8 "errors"
9 "fmt" 9 "fmt"
10 "net/url" 10 "net/url"
11 "path/filepath" 11 "path/filepath"
12 "sort" 12 "sort"
13 "strings" 13 "strings"
14 "sync"
14 "time" 15 "time"
15 16
16 "github.com/Sirupsen/logrus" 17 "github.com/Sirupsen/logrus"
17 18
18 "infra/monitoring/client" 19 "infra/monitoring/client"
19 "infra/monitoring/messages" 20 "infra/monitoring/messages"
20 ) 21 )
21 22
22 const ( 23 const (
23 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete.
24 » StepCompletedRun = "completed run" 25 » StepCompletedRun = "completed run"
26 » treeCloserPri = 0
27 » staleMasterSev = 0
28 » staleBuilderSev = 0
29 » hungBuilderSev = 1
30 » idleBuilderSev = 1
31 » offlineBuilderSev = 1
25 ) 32 )
26 33
27 var ( 34 var (
28 log = logrus.New() 35 log = logrus.New()
29 ) 36 )
30 37
31 var ( 38 var (
32 errNoBuildSteps = errors.New("No build steps") 39 errNoBuildSteps = errors.New("No build steps")
33 ) 40 )
34 41
(...skipping 14 matching lines...) Expand all
49 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer. 56 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer.
50 Reasons []string 57 Reasons []string
51 } 58 }
52 59
53 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on, 60 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on,
54 // in order to produce alerts. 61 // in order to produce alerts.
55 type MasterAnalyzer struct { 62 type MasterAnalyzer struct {
56 // MaxRecentBuilds is the maximum number of recent builds to check, per builder. 63 // MaxRecentBuilds is the maximum number of recent builds to check, per builder.
57 MaxRecentBuilds int 64 MaxRecentBuilds int
58 65
66 // MinRecentBuilds is the minimum number of recent builds to check, per builder.
67 MinRecentBuilds int
68
59 // StepAnalzers are the set of build step failure analyzers to be checke d on 69 // StepAnalzers are the set of build step failure analyzers to be checke d on
60 // build step failures. 70 // build step failures.
61 StepAnalyzers []StepAnalyzer 71 StepAnalyzers []StepAnalyzer
62 72
63 // Client is the Client implementation for fetching json from CBE, build s, etc. 73 // Client is the Client implementation for fetching json from CBE, build s, etc.
64 Client client.Client 74 Client client.Client
65 75
66 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building" 76 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building"
67 // before triggering a "hung builder" alert. 77 // before triggering a "hung builder" alert.
68 HungBuilderThresh time.Duration 78 HungBuilderThresh time.Duration
69 79
70 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline" 80 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline"
71 // before triggering an "offline builder" alert. 81 // before triggering an "offline builder" alert.
72 OfflineBuilderThresh time.Duration 82 OfflineBuilderThresh time.Duration
73 83
74 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue 84 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue
75 // while in the "idle" state before triggering an "idle builder" alert. 85 // while in the "idle" state before triggering an "idle builder" alert.
76 IdleBuilderCountThresh int64 86 IdleBuilderCountThresh int64
77 87
78 // StaleMasterThreshold is the maximum age that master data from CBE can be before 88 // StaleMasterThreshold is the maximum age that master data from CBE can be before
79 // triggering a "stale master" alert. 89 // triggering a "stale master" alert.
80 StaleMasterThreshold time.Duration 90 StaleMasterThreshold time.Duration
81 91
82 // bCache is a map of build cache key to Build message. 92 // bCache is a map of build cache key to Build message.
83 bCache map[string]*messages.Builds 93 bCache map[string]*messages.Builds
94 // bLock protects bCache
95 bLock *sync.Mutex
96
97 // These limit the scope analysis, useful for debugging.
98 TreeOnly string
99 MasterOnly string
100 BuilderOnly string
101 BuildOnly int64
84 102
85 // now is useful for mocking the system clock in testing. 103 // now is useful for mocking the system clock in testing.
86 now func() time.Time 104 now func() time.Time
87 } 105 }
88 106
89 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion. 107 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion.
90 // maxBuilds is the maximum number of builds to check, per builder. 108 // maxBuilds is the maximum number of builds to check, per builder.
91 func New(c client.Client, maxBuilds int) *MasterAnalyzer { 109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer {
92 if c == nil { 110 if c == nil {
93 c = client.New() 111 c = client.New()
94 } 112 }
95 113
96 return &MasterAnalyzer{ 114 return &MasterAnalyzer{
97 Client: c, 115 Client: c,
98 MaxRecentBuilds: maxBuilds, 116 MaxRecentBuilds: maxBuilds,
117 MinRecentBuilds: minBuilds,
99 HungBuilderThresh: 3 * time.Hour, 118 HungBuilderThresh: 3 * time.Hour,
100 OfflineBuilderThresh: 90 * time.Minute, 119 OfflineBuilderThresh: 90 * time.Minute,
101 IdleBuilderCountThresh: 50, 120 IdleBuilderCountThresh: 50,
102 StaleMasterThreshold: 10 * time.Minute, 121 StaleMasterThreshold: 10 * time.Minute,
103 StepAnalyzers: []StepAnalyzer{ 122 StepAnalyzers: []StepAnalyzer{
104 &TestFailureAnalyzer{Client: c}, 123 &TestFailureAnalyzer{Client: c},
124 &CompileFailureAnalyzer{Client: c},
105 }, 125 },
106 126
107 now: func() time.Time { 127 now: func() time.Time {
108 return time.Now() 128 return time.Now()
109 }, 129 },
110 130
111 bCache: map[string]*messages.Builds{}, 131 bCache: map[string]*messages.Builds{},
132 bLock: &sync.Mutex{},
112 } 133 }
113 } 134 }
114 135
115 // MasterAlerts returns alerts generated from the master at URL. 136 // MasterAlerts returns alerts generated from the master at URL.
116 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert { 137 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert {
117 ret := []messages.Alert{} 138 ret := []messages.Alert{}
118 139
119 // Copied logic from builder_messages. 140 // Copied logic from builder_messages.
120 // No created_timestamp should be a warning sign, no? 141 // No created_timestamp should be a warning sign, no?
121 if be.CreatedTimestamp == messages.EpochTime(0) { 142 if be.CreatedTimestamp == messages.EpochTime(0) {
122 return ret 143 return ret
123 } 144 }
124 145
125 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) 146 elapsed := a.now().Sub(be.CreatedTimestamp.Time())
126 if elapsed > a.StaleMasterThreshold { 147 if elapsed > a.StaleMasterThreshold {
127 ret = append(ret, messages.Alert{ 148 ret = append(ret, messages.Alert{
128 Key: fmt.Sprintf("stale master: %v", url), 149 Key: fmt.Sprintf("stale master: %v", url),
129 Title: "Stale Master Data", 150 Title: "Stale Master Data",
130 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()), 151 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()),
131 » » » Severity: 0, 152 » » » Severity: staleMasterSev,
132 Time: messages.TimeToEpochTime(a.now()), 153 Time: messages.TimeToEpochTime(a.now()),
133 Links: []messages.Link{{"Master", url}}, 154 Links: []messages.Link{{"Master", url}},
134 // No type or extension for now. 155 // No type or extension for now.
135 }) 156 })
136 } 157 }
137 if elapsed < 0 { 158 if elapsed < 0 {
138 // Add this to the alerts returned, rather than just log it? 159 // Add this to the alerts returned, rather than just log it?
139 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed) 160 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed)
140 } 161 }
141 162
(...skipping 11 matching lines...) Expand all
153 type r struct { 174 type r struct {
154 bn string 175 bn string
155 b messages.Builders 176 b messages.Builders
156 alerts []messages.Alert 177 alerts []messages.Alert
157 err []error 178 err []error
158 } 179 }
159 c := make(chan r, len(be.Builders)) 180 c := make(chan r, len(be.Builders))
160 181
161 // TODO: get a list of all the running builds from be.Slaves? It 182 // TODO: get a list of all the running builds from be.Slaves? It
162 // appears to be used later on in the original py. 183 // appears to be used later on in the original py.
184 scannedBuilders := []string{}
163 for bn, b := range be.Builders { 185 for bn, b := range be.Builders {
186 if a.BuilderOnly != "" && bn != a.BuilderOnly {
187 continue
188 }
189 scannedBuilders = append(scannedBuilders, bn)
164 go func(bn string, b messages.Builders) { 190 go func(bn string, b messages.Builders) {
165 out := r{bn: bn, b: b} 191 out := r{bn: bn, b: b}
166 defer func() { 192 defer func() {
167 c <- out 193 c <- out
168 }() 194 }()
169 195
170 // This blocks on IO, hence the goroutine. 196 // This blocks on IO, hence the goroutine.
171 a.warmBuildCache(mn, bn, b.CachedBuilds) 197 a.warmBuildCache(mn, bn, b.CachedBuilds)
172
173 // Each call to builderAlerts may trigger blocking json fetches, 198 // Each call to builderAlerts may trigger blocking json fetches,
174 // but it has a data dependency on the above cache-warmi ng call, so 199 // but it has a data dependency on the above cache-warmi ng call, so
175 // the logic remains serial. 200 // the logic remains serial.
176 out.alerts, out.err = a.builderAlerts(mn, bn, &b) 201 out.alerts, out.err = a.builderAlerts(mn, bn, &b)
177 }(bn, b) 202 }(bn, b)
178 } 203 }
179 204
180 ret := []messages.Alert{} 205 ret := []messages.Alert{}
181 » for bn := range be.Builders { 206 » for _, bn := range scannedBuilders {
182 r := <-c 207 r := <-c
183 if len(r.err) != 0 { 208 if len(r.err) != 0 {
184 // TODO: add a special alert for this too? 209 // TODO: add a special alert for this too?
185 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err) 210 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err)
186 } else { 211 } else {
187 ret = append(ret, r.alerts...) 212 ret = append(ret, r.alerts...)
188 } 213 }
189 } 214 }
190 215
191 return ret 216 return ret
192 } 217 }
193 218
194 // masterName extracts the name of the master from the master's URL. 219 // masterName extracts the name of the master from the master's URL.
195 func masterName(URL string) (string, error) { 220 func masterName(URL string) (string, error) {
196 mURL, err := url.Parse(URL) 221 mURL, err := url.Parse(URL)
197 if err != nil { 222 if err != nil {
198 return "", err 223 return "", err
199 } 224 }
200 pathParts := strings.Split(mURL.Path, "/") 225 pathParts := strings.Split(mURL.Path, "/")
201 » return pathParts[len(pathParts)-1], nil 226 » return pathParts[len(pathParts)-2], nil
Vadim Sh. 2015/05/06 21:40:18 check that len(pathParts) is >= 2. Also it would b
seanmccullough 2015/05/06 22:16:20 To lop the /json off the http://.../{master.name}/
202 } 227 }
203 228
204 func cacheKeyForBuild(master, builder string, number int64) string { 229 func cacheKeyForBuild(master, builder string, number int64) string {
205 return filepath.FromSlash( 230 return filepath.FromSlash(
206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number)) 231 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number))
207 } 232 }
208 233
209 // TODO: actually write the on-disk cache. 234 // TODO: actually write the on-disk cache.
210 func filenameForCacheKey(cc string) string { 235 func filenameForCacheKey(cc string) string {
211 cc = strings.Replace(cc, "/", "_", -1) 236 cc = strings.Replace(cc, "/", "_", -1)
(...skipping 10 matching lines...) Expand all
222 v.Add("builder", builder) 247 v.Add("builder", builder)
223 248
224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode()) 249 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode())
225 res := struct { 250 res := struct {
226 Builds []messages.Builds `json:"builds"` 251 Builds []messages.Builds `json:"builds"`
227 }{} 252 }{}
228 253
229 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and 254 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and
230 // return (map[{master, builder}][]Builds, map [{master, builder}]error) 255 // return (map[{master, builder}][]Builds, map [{master, builder}]error)
231 // That way we can do all of these in parallel. 256 // That way we can do all of these in parallel.
257
232 status, err := a.Client.JSON(URL, &res) 258 status, err := a.Client.JSON(URL, &res)
233 if err != nil { 259 if err != nil {
234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) 260 log.Errorf("Error (%d) fetching %s: %s", status, URL, err)
235 } 261 }
236 262
263 a.bLock.Lock()
237 for _, b := range res.Builds { 264 for _, b := range res.Builds {
238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b 265 » » // TODO: consider making res.Builds be []*messages.Builds instea d of []messages.Builds
266 » » ba := b
Vadim Sh. 2015/05/06 21:40:18 IIUC, b is already a copy of whatever is in messag
seanmccullough 2015/05/06 22:16:20 Taking the address of a range var always gives you
Vadim Sh. 2015/05/06 22:30:23 :) Good to know. Go pointers + garbage collection
seanmccullough 2015/05/06 22:43:13 Thankfully there aren't quite as many wtf moments
267 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba
239 } 268 }
269 a.bLock.Unlock()
240 } 270 }
241 271
242 // This type is used for sorting build IDs. 272 // This type is used for sorting build IDs.
243 type buildIDs []int64 273 type buildIDs []int64
244 274
245 func (a buildIDs) Len() int { return len(a) } 275 func (a buildIDs) Len() int { return len(a) }
246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 276 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } 277 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] }
248 278
249 // latestBuildStep returns the latest build step name and update time, and an er ror 279 // latestBuildStep returns the latest build step name and update time, and an er ror
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] 313 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]
284 } 314 }
285 if len(recentBuildIDs) == 0 { 315 if len(recentBuildIDs) == 0 {
286 // TODO: Make an alert for this? 316 // TODO: Make an alert for this?
287 log.Errorf("No recent builds for %s.%s", mn, bn) 317 log.Errorf("No recent builds for %s.%s", mn, bn)
288 return alerts, errs 318 return alerts, errs
289 } 319 }
290 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn) 320 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn)
291 321
292 // Check for alertable step failures. 322 // Check for alertable step failures.
293 » for _, buildID := range recentBuildIDs { 323 » stepFailureAlerts := map[string]messages.Alert{}
324 » firstFailureInstance := map[string]int64{}
325
326 » for i, buildID := range recentBuildIDs {
294 failures, err := a.stepFailures(mn, bn, buildID) 327 failures, err := a.stepFailures(mn, bn, buildID)
295 if err != nil { 328 if err != nil {
296 errs = append(errs, err) 329 errs = append(errs, err)
297 } 330 }
331 if len(failures) == 0 && i > a.MinRecentBuilds {
332 // Bail as soon as we find a successful build prior to t he most recent couple of builds.
333 break
334 }
335
298 as, err := a.stepFailureAlerts(failures) 336 as, err := a.stepFailureAlerts(failures)
299 if err != nil { 337 if err != nil {
300 errs = append(errs, err) 338 errs = append(errs, err)
301 } 339 }
302 » » alerts = append(alerts, as...) 340
341 » » lastKey := ""
342 » » // This probably isn't exactly what we want since multiple build ers may have the same alert firing.
343 » » // This just merges build ranges on a per-builder basis.
344 » » for _, alr := range as {
345 » » » // Only keep the most recent alert. Since recentBuildID s is
346 » » » // sorted newest build first, ignore older instances of the alert.
347 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok {
348 » » » » stepFailureAlerts[alr.Key] = alr
349 » » » }
350 » » » if alr.Key == lastKey || lastKey == "" {
351 » » » » firstFailureInstance[alr.Key] = alr.Extension.(m essages.BuildFailure).Builders[0].FirstFailure
Vadim Sh. 2015/05/06 21:40:18 This type assertion looks rather magical... Perhap
seanmccullough 2015/05/06 22:16:20 Added a check and log an error if it fails. Not su
352 » » » }
353 » » » lastKey = alr.Key
354 » » }
303 } 355 }
304 356
305 » // Check for stale builders. Latest build is the first in the list. 357 » for _, alr := range stepFailureAlerts {
358 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key]
359 » » alerts = append(alerts, alr)
360 » }
361
362 » // Check for stale/idle/offline builders. Latest build is the first in the list.
306 lastBuildID := recentBuildIDs[0] 363 lastBuildID := recentBuildIDs[0]
307 log.Infof("Checking last build ID: %d", lastBuildID) 364 log.Infof("Checking last build ID: %d", lastBuildID)
365
308 // TODO: get this from cache. 366 // TODO: get this from cache.
309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID) 367 » var lastBuild *messages.Builds
310 » if err != nil { 368 » a.bLock.Lock()
311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err)) 369 » lastBuild = a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)]
Vadim Sh. 2015/05/06 21:40:18 := and remove var above
seanmccullough 2015/05/06 22:16:20 Done.
312 » » return alerts, errs 370 » a.bLock.Unlock()
371 » if lastBuild == nil {
372 » » var err error
373 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID)
374 » » if err != nil {
375 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil d %d for %s.%s: %s", lastBuildID, mn, bn, err))
376 » » » return alerts, errs
377 » » }
313 } 378 }
314 379
315 // Examining only the latest build is probably suboptimal since if it's still in progress it might 380 // Examining only the latest build is probably suboptimal since if it's still in progress it might
316 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds. 381 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds.
317 // AKA "Reliable failures". TODO: Identify "Reliable failures" 382 // AKA "Reliable failures". TODO: Identify "Reliable failures"
318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) 383 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)
319 if err != nil { 384 if err != nil {
320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err)) 385 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err))
321 return alerts, errs 386 return alerts, errs
322 387
323 } 388 }
324 elapsed := a.now().Sub(lastUpdated.Time()) 389 elapsed := a.now().Sub(lastUpdated.Time())
325 links := []messages.Link{ 390 links := []messages.Link{
326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)}, 391 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)},
327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)}, 392 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)},
328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, 393 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)},
329 } 394 }
330 395
331 switch b.State { 396 switch b.State {
332 case messages.StateBuilding: 397 case messages.StateBuilding:
333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { 398 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
334 alerts = append(alerts, messages.Alert{ 399 alerts = append(alerts, messages.Alert{
335 Key: fmt.Sprintf("%s.%s.hung", mn, bn), 400 Key: fmt.Sprintf("%s.%s.hung", mn, bn),
336 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep), 401 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep),
337 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh), 402 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh),
338 » » » » Severity: 0, 403 » » » » Severity: hungBuilderSev,
339 Time: messages.TimeToEpochTime(a.now()), 404 Time: messages.TimeToEpochTime(a.now()),
340 Links: links, 405 Links: links,
341 }) 406 })
342 // Note, just because it's building doesn't mean it's in a good state. If the last N builds 407 // Note, just because it's building doesn't mean it's in a good state. If the last N builds
343 // all failed (for some large N) then this might still b e alertable. 408 // all failed (for some large N) then this might still b e alertable.
344 } 409 }
345 case messages.StateOffline: 410 case messages.StateOffline:
346 if elapsed > a.OfflineBuilderThresh { 411 if elapsed > a.OfflineBuilderThresh {
347 alerts = append(alerts, messages.Alert{ 412 alerts = append(alerts, messages.Alert{
348 Key: fmt.Sprintf("%s.%s.offline", mn, bn), 413 Key: fmt.Sprintf("%s.%s.offline", mn, bn),
349 Title: fmt.Sprintf("%s.%s is offline.", mn, b n), 414 Title: fmt.Sprintf("%s.%s is offline.", mn, b n),
350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh), 415 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
351 » » » » Severity: 0, 416 » » » » Severity: offlineBuilderSev,
352 Time: messages.TimeToEpochTime(a.now()), 417 Time: messages.TimeToEpochTime(a.now()),
353 Links: links, 418 Links: links,
354 }) 419 })
355 } 420 }
356 case messages.StateIdle: 421 case messages.StateIdle:
357 if b.PendingBuilds > a.IdleBuilderCountThresh { 422 if b.PendingBuilds > a.IdleBuilderCountThresh {
358 alerts = append(alerts, messages.Alert{ 423 alerts = append(alerts, messages.Alert{
359 Key: fmt.Sprintf("%s.%s.idle", mn, bn), 424 Key: fmt.Sprintf("%s.%s.idle", mn, bn),
360 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn), 425 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn),
361 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh), 426 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh),
362 » » » » Severity: 0, 427 » » » » Severity: idleBuilderSev,
363 Time: messages.TimeToEpochTime(a.now()), 428 Time: messages.TimeToEpochTime(a.now()),
364 Links: links, 429 Links: links,
365 }) 430 })
366 } 431 }
367 default: 432 default:
368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) 433 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State)
369 } 434 }
370 435
371 return alerts, errs 436 return alerts, errs
372 } 437 }
373 438
374 // stepFailures returns the steps that have failed recently on builder bn. 439 // stepFailures returns the steps that have failed recently on builder bn.
375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) { 440 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) {
376 cc := cacheKeyForBuild(mn, bn, bID) 441 cc := cacheKeyForBuild(mn, bn, bID)
377 442
378 var err error // To avoid re-scoping b in the nested conditional below w ith a :=. 443 var err error // To avoid re-scoping b in the nested conditional below w ith a :=.
444 a.bLock.Lock()
379 b, ok := a.bCache[cc] 445 b, ok := a.bCache[cc]
446 a.bLock.Unlock()
380 if !ok { 447 if !ok {
381 log.Infof("Cache miss for %s", cc) 448 log.Infof("Cache miss for %s", cc)
382 b, err = a.Client.Build(mn, bn, bID) 449 b, err = a.Client.Build(mn, bn, bID)
383 if err != nil || b == nil { 450 if err != nil || b == nil {
384 log.Errorf("Error fetching build: %v", err) 451 log.Errorf("Error fetching build: %v", err)
385 return nil, err 452 return nil, err
386 } 453 }
387 } 454 }
388 455
389 ret := []stepFailure{} 456 ret := []stepFailure{}
457
390 for _, s := range b.Steps { 458 for _, s := range b.Steps {
391 if !s.IsFinished || len(s.Results) == 0 { 459 if !s.IsFinished || len(s.Results) == 0 {
392 continue 460 continue
393 } 461 }
394 // Because Results in the json data is a homogeneous array, the unmarshaler 462 // Because Results in the json data is a homogeneous array, the unmarshaler
395 // doesn't have any type information to assert about it. We have to do 463 // doesn't have any type information to assert about it. We have to do
396 // some ugly runtime type assertion ourselves. 464 // some ugly runtime type assertion ourselves.
397 if r, ok := s.Results[0].(float64); ok { 465 if r, ok := s.Results[0].(float64); ok {
398 if r == 0 || r == 1 { 466 if r == 0 || r == 1 {
399 // This 0/1 check seems to be a convention or he uristic. A 0 or 1 467 // This 0/1 check seems to be a convention or he uristic. A 0 or 1
(...skipping 22 matching lines...) Expand all
422 ret := []messages.Alert{} 490 ret := []messages.Alert{}
423 type res struct { 491 type res struct {
424 f stepFailure 492 f stepFailure
425 a *messages.Alert 493 a *messages.Alert
426 err error 494 err error
427 } 495 }
428 496
429 // Might not need full capacity buffer, since some failures are ignored below. 497 // Might not need full capacity buffer, since some failures are ignored below.
430 rs := make(chan res, len(failures)) 498 rs := make(chan res, len(failures))
431 499
500 scannedFailures := []stepFailure{}
432 for _, f := range failures { 501 for _, f := range failures {
433 // goroutine/channel because the reasonsForFailure call potentia lly 502 // goroutine/channel because the reasonsForFailure call potentia lly
434 // blocks on IO. 503 // blocks on IO.
504 if f.step.Name == "steps" {
505 continue
506 // The actual breaking step will appear later.
507 }
508 scannedFailures = append(scannedFailures, f)
435 go func(f stepFailure) { 509 go func(f stepFailure) {
436 alr := messages.Alert{ 510 alr := messages.Alert{
437 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), 511 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName),
438 Time: messages.EpochTime(a.now().Unix()), 512 Time: messages.EpochTime(a.now().Unix()),
439 Type: "buildfailure", 513 Type: "buildfailure",
440 } 514 }
441 515
516 // If the builder has been failing on the same step for multiple builds in a row,
517 // we should have only one alert but indicate the range of builds affected.
518 // These are set in FirstFailure and LastFailure.
442 bf := messages.BuildFailure{ 519 bf := messages.BuildFailure{
443 // FIXME: group builders? 520 // FIXME: group builders?
444 Builders: []messages.AlertedBuilder{ 521 Builders: []messages.AlertedBuilder{
445 { 522 {
446 Name: f.builderName, 523 Name: f.builderName,
447 » » » » » » URL: f.URL(), 524 » » » » » » URL: fmt.Sprintf("http s://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName),
448 » » » » » » FirstFailure: 0, 525 » » » » » » FirstFailure: f.build.Number,
449 » » » » » » LatestFailure: 1, 526 » » » » » » LatestFailure: f.build.Number,
450 }, 527 },
451 }, 528 },
452 // TODO: RegressionRanges: 529 // TODO: RegressionRanges:
453 // look into Builds.SourceStamp.Changes. 530 // look into Builds.SourceStamp.Changes.
454 } 531 }
455 532
456 reasons := a.reasonsForFailure(f) 533 reasons := a.reasonsForFailure(f)
457 for _, r := range reasons { 534 for _, r := range reasons {
458 bf.Reasons = append(bf.Reasons, messages.Reason{ 535 bf.Reasons = append(bf.Reasons, messages.Reason{
459 TestName: r, 536 TestName: r,
460 Step: f.step.Name, 537 Step: f.step.Name,
538 URL: f.URL(),
461 }) 539 })
462 } 540 }
463 541
464 alr.Extension = bf 542 alr.Extension = bf
465 if len(bf.Reasons) == 0 { 543 if len(bf.Reasons) == 0 {
466 » » » » log.Warnf("No reasons for step failure: %s", ale rtKey(f.masterName, f.builderName, f.step.Name, "")) 544 » » » » alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "")
467 » » » » rs <- res{ 545 » » » » log.Warnf("No reasons for step failure: %s", alr .Key)
468 » » » » » f: f,
469 » » » » }
470 } else { 546 } else {
471 // Should the key include all of the reasons? 547 // Should the key include all of the reasons?
472 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0]) 548 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0])
473 549 » » » }
474 » » » » rs <- res{ 550 » » » rs <- res{
475 » » » » » f: f, 551 » » » » f: f,
476 » » » » » a: &alr, 552 » » » » a: &alr,
477 » » » » » err: nil, 553 » » » » err: nil,
478 » » » » }
479 } 554 }
480 }(f) 555 }(f)
481 } 556 }
482 557
483 » for _ = range failures { 558 » for _ = range scannedFailures {
484 r := <-rs 559 r := <-rs
485 if r.a != nil { 560 if r.a != nil {
486 ret = append(ret, *r.a) 561 ret = append(ret, *r.a)
487 } 562 }
488 } 563 }
489 564
490 return ret, nil 565 return ret, nil
491 } 566 }
492 567
493 // reasonsForFailure examines the step failure and applies some heuristics to 568 // reasonsForFailure examines the step failure and applies some heuristics to
494 // to find the cause. It may make blocking IO calls in the process. 569 // to find the cause. It may make blocking IO calls in the process.
495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { 570 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string {
496 ret := []string{} 571 ret := []string{}
497 recognized := false 572 recognized := false
498 log.Infof("Checking for reasons for failure step: %v", f.step.Name) 573 log.Infof("Checking for reasons for failure step: %v", f.step.Name)
499 for _, sfa := range a.StepAnalyzers { 574 for _, sfa := range a.StepAnalyzers {
500 res, err := sfa.Analyze(f) 575 res, err := sfa.Analyze(f)
501 if err != nil { 576 if err != nil {
577 // TODO: return something that contains errors *and* rea sons.
502 log.Errorf("Error get reasons from StepAnalyzer: %v", er r) 578 log.Errorf("Error get reasons from StepAnalyzer: %v", er r)
579 continue
503 } 580 }
504 if res.Recognized { 581 if res.Recognized {
505 recognized = true 582 recognized = true
506 ret = append(ret, res.Reasons...) 583 ret = append(ret, res.Reasons...)
507 } 584 }
508 } 585 }
509 586
510 if !recognized { 587 if !recognized {
511 // TODO: log and report frequently encountered unrecognized buil der step 588 // TODO: log and report frequently encountered unrecognized buil der step
512 // failure names. 589 // failure names.
(...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after
552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. 629 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths.
553 func oldEscape(s string) string { 630 func oldEscape(s string) string {
554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) 631 return strings.Replace(url.QueryEscape(s), "+", "%20", -1)
555 } 632 }
556 633
557 // URL returns a url to builder step failure page. 634 // URL returns a url to builder step failure page.
558 func (f stepFailure) URL() string { 635 func (f stepFailure) URL() string {
559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s", 636 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s",
560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name)) 637 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name))
561 } 638 }
OLDNEW
« no previous file with comments | « no previous file | go/src/infra/monitoring/analyzer/analyzer_test.go » ('j') | go/src/infra/monitoring/messages/alerts.go » ('J')

Powered by Google App Engine
This is Rietveld 408576698