Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package analyzer | 5 package analyzer |
| 6 | 6 |
| 7 import ( | 7 import ( |
| 8 "errors" | 8 "errors" |
| 9 "fmt" | 9 "fmt" |
| 10 "net/url" | 10 "net/url" |
| 11 "path/filepath" | 11 "path/filepath" |
| 12 "sort" | 12 "sort" |
| 13 "strings" | 13 "strings" |
| 14 "sync" | |
| 14 "time" | 15 "time" |
| 15 | 16 |
| 16 "github.com/Sirupsen/logrus" | 17 "github.com/Sirupsen/logrus" |
| 17 | 18 |
| 18 "infra/monitoring/client" | 19 "infra/monitoring/client" |
| 19 "infra/monitoring/messages" | 20 "infra/monitoring/messages" |
| 20 ) | 21 ) |
| 21 | 22 |
| 22 const ( | 23 const ( |
| 23 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. | 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. |
| 24 » StepCompletedRun = "completed run" | 25 » StepCompletedRun = "completed run" |
| 26 » treeCloserPri = 0 | |
| 27 » staleMasterSev = 0 | |
| 28 » staleBuilderSev = 0 | |
| 29 » hungBuilderSev = 1 | |
| 30 » idleBuilderSev = 1 | |
| 31 » offlineBuilderSev = 1 | |
| 25 ) | 32 ) |
| 26 | 33 |
| 27 var ( | 34 var ( |
| 28 log = logrus.New() | 35 log = logrus.New() |
| 29 ) | 36 ) |
| 30 | 37 |
| 31 var ( | 38 var ( |
| 32 errNoBuildSteps = errors.New("No build steps") | 39 errNoBuildSteps = errors.New("No build steps") |
| 33 ) | 40 ) |
| 34 | 41 |
| (...skipping 14 matching lines...) Expand all Loading... | |
| 49 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer. | 56 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer. |
| 50 Reasons []string | 57 Reasons []string |
| 51 } | 58 } |
| 52 | 59 |
| 53 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on, | 60 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on, |
| 54 // in order to produce alerts. | 61 // in order to produce alerts. |
| 55 type MasterAnalyzer struct { | 62 type MasterAnalyzer struct { |
| 56 // MaxRecentBuilds is the maximum number of recent builds to check, per builder. | 63 // MaxRecentBuilds is the maximum number of recent builds to check, per builder. |
| 57 MaxRecentBuilds int | 64 MaxRecentBuilds int |
| 58 | 65 |
| 66 // MinRecentBuilds is the minimum number of recent builds to check, per builder. | |
| 67 MinRecentBuilds int | |
| 68 | |
| 59 // StepAnalzers are the set of build step failure analyzers to be checke d on | 69 // StepAnalzers are the set of build step failure analyzers to be checke d on |
| 60 // build step failures. | 70 // build step failures. |
| 61 StepAnalyzers []StepAnalyzer | 71 StepAnalyzers []StepAnalyzer |
| 62 | 72 |
| 63 // Client is the Client implementation for fetching json from CBE, build s, etc. | 73 // Client is the Client implementation for fetching json from CBE, build s, etc. |
| 64 Client client.Client | 74 Client client.Client |
| 65 | 75 |
| 66 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building" | 76 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building" |
| 67 // before triggering a "hung builder" alert. | 77 // before triggering a "hung builder" alert. |
| 68 HungBuilderThresh time.Duration | 78 HungBuilderThresh time.Duration |
| 69 | 79 |
| 70 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline" | 80 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline" |
| 71 // before triggering an "offline builder" alert. | 81 // before triggering an "offline builder" alert. |
| 72 OfflineBuilderThresh time.Duration | 82 OfflineBuilderThresh time.Duration |
| 73 | 83 |
| 74 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue | 84 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue |
| 75 // while in the "idle" state before triggering an "idle builder" alert. | 85 // while in the "idle" state before triggering an "idle builder" alert. |
| 76 IdleBuilderCountThresh int64 | 86 IdleBuilderCountThresh int64 |
| 77 | 87 |
| 78 // StaleMasterThreshold is the maximum age that master data from CBE can be before | 88 // StaleMasterThreshold is the maximum age that master data from CBE can be before |
| 79 // triggering a "stale master" alert. | 89 // triggering a "stale master" alert. |
| 80 StaleMasterThreshold time.Duration | 90 StaleMasterThreshold time.Duration |
| 81 | 91 |
| 82 // bCache is a map of build cache key to Build message. | 92 // bCache is a map of build cache key to Build message. |
| 83 bCache map[string]*messages.Builds | 93 bCache map[string]*messages.Builds |
| 94 // bLock protects bCache | |
| 95 bLock *sync.Mutex | |
| 96 | |
| 97 // These limit the scope analysis, useful for debugging. | |
| 98 TreeOnly string | |
| 99 MasterOnly string | |
| 100 BuilderOnly string | |
| 101 BuildOnly int64 | |
| 84 | 102 |
| 85 // now is useful for mocking the system clock in testing. | 103 // now is useful for mocking the system clock in testing. |
| 86 now func() time.Time | 104 now func() time.Time |
| 87 } | 105 } |
| 88 | 106 |
| 89 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion. | 107 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion. |
| 90 // maxBuilds is the maximum number of builds to check, per builder. | 108 // maxBuilds is the maximum number of builds to check, per builder. |
| 91 func New(c client.Client, maxBuilds int) *MasterAnalyzer { | 109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer { |
| 92 if c == nil { | 110 if c == nil { |
| 93 c = client.New() | 111 c = client.New() |
| 94 } | 112 } |
| 95 | 113 |
| 96 return &MasterAnalyzer{ | 114 return &MasterAnalyzer{ |
| 97 Client: c, | 115 Client: c, |
| 98 MaxRecentBuilds: maxBuilds, | 116 MaxRecentBuilds: maxBuilds, |
| 117 MinRecentBuilds: minBuilds, | |
| 99 HungBuilderThresh: 3 * time.Hour, | 118 HungBuilderThresh: 3 * time.Hour, |
| 100 OfflineBuilderThresh: 90 * time.Minute, | 119 OfflineBuilderThresh: 90 * time.Minute, |
| 101 IdleBuilderCountThresh: 50, | 120 IdleBuilderCountThresh: 50, |
| 102 StaleMasterThreshold: 10 * time.Minute, | 121 StaleMasterThreshold: 10 * time.Minute, |
| 103 StepAnalyzers: []StepAnalyzer{ | 122 StepAnalyzers: []StepAnalyzer{ |
| 104 &TestFailureAnalyzer{Client: c}, | 123 &TestFailureAnalyzer{Client: c}, |
| 124 &CompileFailureAnalyzer{Client: c}, | |
| 105 }, | 125 }, |
| 106 | 126 |
| 107 now: func() time.Time { | 127 now: func() time.Time { |
| 108 return time.Now() | 128 return time.Now() |
| 109 }, | 129 }, |
| 110 | 130 |
| 111 bCache: map[string]*messages.Builds{}, | 131 bCache: map[string]*messages.Builds{}, |
| 132 bLock: &sync.Mutex{}, | |
| 112 } | 133 } |
| 113 } | 134 } |
| 114 | 135 |
| 115 // MasterAlerts returns alerts generated from the master at URL. | 136 // MasterAlerts returns alerts generated from the master at URL. |
| 116 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert { | 137 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m essages.Alert { |
| 117 ret := []messages.Alert{} | 138 ret := []messages.Alert{} |
| 118 | 139 |
| 119 // Copied logic from builder_messages. | 140 // Copied logic from builder_messages. |
| 120 // No created_timestamp should be a warning sign, no? | 141 // No created_timestamp should be a warning sign, no? |
| 121 if be.CreatedTimestamp == messages.EpochTime(0) { | 142 if be.CreatedTimestamp == messages.EpochTime(0) { |
| 122 return ret | 143 return ret |
| 123 } | 144 } |
| 124 | 145 |
| 125 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) | 146 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) |
| 126 if elapsed > a.StaleMasterThreshold { | 147 if elapsed > a.StaleMasterThreshold { |
| 127 ret = append(ret, messages.Alert{ | 148 ret = append(ret, messages.Alert{ |
| 128 Key: fmt.Sprintf("stale master: %v", url), | 149 Key: fmt.Sprintf("stale master: %v", url), |
| 129 Title: "Stale Master Data", | 150 Title: "Stale Master Data", |
| 130 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()), | 151 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()), |
| 131 » » » Severity: 0, | 152 » » » Severity: staleMasterSev, |
| 132 Time: messages.TimeToEpochTime(a.now()), | 153 Time: messages.TimeToEpochTime(a.now()), |
| 133 Links: []messages.Link{{"Master", url}}, | 154 Links: []messages.Link{{"Master", url}}, |
| 134 // No type or extension for now. | 155 // No type or extension for now. |
| 135 }) | 156 }) |
| 136 } | 157 } |
| 137 if elapsed < 0 { | 158 if elapsed < 0 { |
| 138 // Add this to the alerts returned, rather than just log it? | 159 // Add this to the alerts returned, rather than just log it? |
| 139 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed) | 160 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed) |
| 140 } | 161 } |
| 141 | 162 |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 153 type r struct { | 174 type r struct { |
| 154 bn string | 175 bn string |
| 155 b messages.Builders | 176 b messages.Builders |
| 156 alerts []messages.Alert | 177 alerts []messages.Alert |
| 157 err []error | 178 err []error |
| 158 } | 179 } |
| 159 c := make(chan r, len(be.Builders)) | 180 c := make(chan r, len(be.Builders)) |
| 160 | 181 |
| 161 // TODO: get a list of all the running builds from be.Slaves? It | 182 // TODO: get a list of all the running builds from be.Slaves? It |
| 162 // appears to be used later on in the original py. | 183 // appears to be used later on in the original py. |
| 184 scannedBuilders := []string{} | |
| 163 for bn, b := range be.Builders { | 185 for bn, b := range be.Builders { |
| 186 if a.BuilderOnly != "" && bn != a.BuilderOnly { | |
| 187 continue | |
| 188 } | |
| 189 scannedBuilders = append(scannedBuilders, bn) | |
| 164 go func(bn string, b messages.Builders) { | 190 go func(bn string, b messages.Builders) { |
| 165 out := r{bn: bn, b: b} | 191 out := r{bn: bn, b: b} |
| 166 defer func() { | 192 defer func() { |
| 167 c <- out | 193 c <- out |
| 168 }() | 194 }() |
| 169 | 195 |
| 170 // This blocks on IO, hence the goroutine. | 196 // This blocks on IO, hence the goroutine. |
| 171 a.warmBuildCache(mn, bn, b.CachedBuilds) | 197 a.warmBuildCache(mn, bn, b.CachedBuilds) |
| 172 | |
| 173 // Each call to builderAlerts may trigger blocking json fetches, | 198 // Each call to builderAlerts may trigger blocking json fetches, |
| 174 // but it has a data dependency on the above cache-warmi ng call, so | 199 // but it has a data dependency on the above cache-warmi ng call, so |
| 175 // the logic remains serial. | 200 // the logic remains serial. |
| 176 out.alerts, out.err = a.builderAlerts(mn, bn, &b) | 201 out.alerts, out.err = a.builderAlerts(mn, bn, &b) |
| 177 }(bn, b) | 202 }(bn, b) |
| 178 } | 203 } |
| 179 | 204 |
| 180 ret := []messages.Alert{} | 205 ret := []messages.Alert{} |
| 181 » for bn := range be.Builders { | 206 » for _, bn := range scannedBuilders { |
| 182 r := <-c | 207 r := <-c |
| 183 if len(r.err) != 0 { | 208 if len(r.err) != 0 { |
| 184 // TODO: add a special alert for this too? | 209 // TODO: add a special alert for this too? |
| 185 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err) | 210 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err) |
| 186 } else { | 211 } else { |
| 187 ret = append(ret, r.alerts...) | 212 ret = append(ret, r.alerts...) |
| 188 } | 213 } |
| 189 } | 214 } |
| 190 | 215 |
| 191 return ret | 216 return ret |
| 192 } | 217 } |
| 193 | 218 |
| 194 // masterName extracts the name of the master from the master's URL. | 219 // masterName extracts the name of the master from the master's URL. |
| 195 func masterName(URL string) (string, error) { | 220 func masterName(URL string) (string, error) { |
| 196 mURL, err := url.Parse(URL) | 221 mURL, err := url.Parse(URL) |
| 197 if err != nil { | 222 if err != nil { |
| 198 return "", err | 223 return "", err |
| 199 } | 224 } |
| 200 pathParts := strings.Split(mURL.Path, "/") | 225 pathParts := strings.Split(mURL.Path, "/") |
| 201 » return pathParts[len(pathParts)-1], nil | 226 » return pathParts[len(pathParts)-2], nil |
|
Vadim Sh.
2015/05/06 21:40:18
check that len(pathParts) is >= 2. Also it would b
seanmccullough
2015/05/06 22:16:20
To lop the /json off the http://.../{master.name}/
| |
| 202 } | 227 } |
| 203 | 228 |
| 204 func cacheKeyForBuild(master, builder string, number int64) string { | 229 func cacheKeyForBuild(master, builder string, number int64) string { |
| 205 return filepath.FromSlash( | 230 return filepath.FromSlash( |
| 206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number)) | 231 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number)) |
| 207 } | 232 } |
| 208 | 233 |
| 209 // TODO: actually write the on-disk cache. | 234 // TODO: actually write the on-disk cache. |
| 210 func filenameForCacheKey(cc string) string { | 235 func filenameForCacheKey(cc string) string { |
| 211 cc = strings.Replace(cc, "/", "_", -1) | 236 cc = strings.Replace(cc, "/", "_", -1) |
| (...skipping 10 matching lines...) Expand all Loading... | |
| 222 v.Add("builder", builder) | 247 v.Add("builder", builder) |
| 223 | 248 |
| 224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode()) | 249 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode()) |
| 225 res := struct { | 250 res := struct { |
| 226 Builds []messages.Builds `json:"builds"` | 251 Builds []messages.Builds `json:"builds"` |
| 227 }{} | 252 }{} |
| 228 | 253 |
| 229 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and | 254 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and |
| 230 // return (map[{master, builder}][]Builds, map [{master, builder}]error) | 255 // return (map[{master, builder}][]Builds, map [{master, builder}]error) |
| 231 // That way we can do all of these in parallel. | 256 // That way we can do all of these in parallel. |
| 257 | |
| 232 status, err := a.Client.JSON(URL, &res) | 258 status, err := a.Client.JSON(URL, &res) |
| 233 if err != nil { | 259 if err != nil { |
| 234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) | 260 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) |
| 235 } | 261 } |
| 236 | 262 |
| 263 a.bLock.Lock() | |
| 237 for _, b := range res.Builds { | 264 for _, b := range res.Builds { |
| 238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b | 265 » » // TODO: consider making res.Builds be []*messages.Builds instea d of []messages.Builds |
| 266 » » ba := b | |
|
Vadim Sh.
2015/05/06 21:40:18
IIUC, b is already a copy of whatever is in messag
seanmccullough
2015/05/06 22:16:20
Taking the address of a range var always gives you
Vadim Sh.
2015/05/06 22:30:23
:) Good to know. Go pointers + garbage collection
seanmccullough
2015/05/06 22:43:13
Thankfully there aren't quite as many wtf moments
| |
| 267 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba | |
| 239 } | 268 } |
| 269 a.bLock.Unlock() | |
| 240 } | 270 } |
| 241 | 271 |
| 242 // This type is used for sorting build IDs. | 272 // This type is used for sorting build IDs. |
| 243 type buildIDs []int64 | 273 type buildIDs []int64 |
| 244 | 274 |
| 245 func (a buildIDs) Len() int { return len(a) } | 275 func (a buildIDs) Len() int { return len(a) } |
| 246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | 276 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
| 247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } | 277 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } |
| 248 | 278 |
| 249 // latestBuildStep returns the latest build step name and update time, and an er ror | 279 // latestBuildStep returns the latest build step name and update time, and an er ror |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] | 313 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] |
| 284 } | 314 } |
| 285 if len(recentBuildIDs) == 0 { | 315 if len(recentBuildIDs) == 0 { |
| 286 // TODO: Make an alert for this? | 316 // TODO: Make an alert for this? |
| 287 log.Errorf("No recent builds for %s.%s", mn, bn) | 317 log.Errorf("No recent builds for %s.%s", mn, bn) |
| 288 return alerts, errs | 318 return alerts, errs |
| 289 } | 319 } |
| 290 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn) | 320 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn) |
| 291 | 321 |
| 292 // Check for alertable step failures. | 322 // Check for alertable step failures. |
| 293 » for _, buildID := range recentBuildIDs { | 323 » stepFailureAlerts := map[string]messages.Alert{} |
| 324 » firstFailureInstance := map[string]int64{} | |
| 325 | |
| 326 » for i, buildID := range recentBuildIDs { | |
| 294 failures, err := a.stepFailures(mn, bn, buildID) | 327 failures, err := a.stepFailures(mn, bn, buildID) |
| 295 if err != nil { | 328 if err != nil { |
| 296 errs = append(errs, err) | 329 errs = append(errs, err) |
| 297 } | 330 } |
| 331 if len(failures) == 0 && i > a.MinRecentBuilds { | |
| 332 // Bail as soon as we find a successful build prior to t he most recent couple of builds. | |
| 333 break | |
| 334 } | |
| 335 | |
| 298 as, err := a.stepFailureAlerts(failures) | 336 as, err := a.stepFailureAlerts(failures) |
| 299 if err != nil { | 337 if err != nil { |
| 300 errs = append(errs, err) | 338 errs = append(errs, err) |
| 301 } | 339 } |
| 302 » » alerts = append(alerts, as...) | 340 |
| 341 » » lastKey := "" | |
| 342 » » // This probably isn't exactly what we want since multiple build ers may have the same alert firing. | |
| 343 » » // This just merges build ranges on a per-builder basis. | |
| 344 » » for _, alr := range as { | |
| 345 » » » // Only keep the most recent alert. Since recentBuildID s is | |
| 346 » » » // sorted newest build first, ignore older instances of the alert. | |
| 347 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok { | |
| 348 » » » » stepFailureAlerts[alr.Key] = alr | |
| 349 » » » } | |
| 350 » » » if alr.Key == lastKey || lastKey == "" { | |
| 351 » » » » firstFailureInstance[alr.Key] = alr.Extension.(m essages.BuildFailure).Builders[0].FirstFailure | |
|
Vadim Sh.
2015/05/06 21:40:18
This type assertion looks rather magical... Perhap
seanmccullough
2015/05/06 22:16:20
Added a check and log an error if it fails. Not su
| |
| 352 » » » } | |
| 353 » » » lastKey = alr.Key | |
| 354 » » } | |
| 303 } | 355 } |
| 304 | 356 |
| 305 » // Check for stale builders. Latest build is the first in the list. | 357 » for _, alr := range stepFailureAlerts { |
| 358 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key] | |
| 359 » » alerts = append(alerts, alr) | |
| 360 » } | |
| 361 | |
| 362 » // Check for stale/idle/offline builders. Latest build is the first in the list. | |
| 306 lastBuildID := recentBuildIDs[0] | 363 lastBuildID := recentBuildIDs[0] |
| 307 log.Infof("Checking last build ID: %d", lastBuildID) | 364 log.Infof("Checking last build ID: %d", lastBuildID) |
| 365 | |
| 308 // TODO: get this from cache. | 366 // TODO: get this from cache. |
| 309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID) | 367 » var lastBuild *messages.Builds |
| 310 » if err != nil { | 368 » a.bLock.Lock() |
| 311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err)) | 369 » lastBuild = a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)] |
|
Vadim Sh.
2015/05/06 21:40:18
:= and remove var above
seanmccullough
2015/05/06 22:16:20
Done.
| |
| 312 » » return alerts, errs | 370 » a.bLock.Unlock() |
| 371 » if lastBuild == nil { | |
| 372 » » var err error | |
| 373 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID) | |
| 374 » » if err != nil { | |
| 375 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil d %d for %s.%s: %s", lastBuildID, mn, bn, err)) | |
| 376 » » » return alerts, errs | |
| 377 » » } | |
| 313 } | 378 } |
| 314 | 379 |
| 315 // Examining only the latest build is probably suboptimal since if it's still in progress it might | 380 // Examining only the latest build is probably suboptimal since if it's still in progress it might |
| 316 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds. | 381 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds. |
| 317 // AKA "Reliable failures". TODO: Identify "Reliable failures" | 382 // AKA "Reliable failures". TODO: Identify "Reliable failures" |
| 318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) | 383 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) |
| 319 if err != nil { | 384 if err != nil { |
| 320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err)) | 385 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err)) |
| 321 return alerts, errs | 386 return alerts, errs |
| 322 | 387 |
| 323 } | 388 } |
| 324 elapsed := a.now().Sub(lastUpdated.Time()) | 389 elapsed := a.now().Sub(lastUpdated.Time()) |
| 325 links := []messages.Link{ | 390 links := []messages.Link{ |
| 326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)}, | 391 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)}, |
| 327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)}, | 392 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)}, |
| 328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, | 393 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, |
| 329 } | 394 } |
| 330 | 395 |
| 331 switch b.State { | 396 switch b.State { |
| 332 case messages.StateBuilding: | 397 case messages.StateBuilding: |
| 333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { | 398 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { |
| 334 alerts = append(alerts, messages.Alert{ | 399 alerts = append(alerts, messages.Alert{ |
| 335 Key: fmt.Sprintf("%s.%s.hung", mn, bn), | 400 Key: fmt.Sprintf("%s.%s.hung", mn, bn), |
| 336 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep), | 401 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep), |
| 337 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh), | 402 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh), |
| 338 » » » » Severity: 0, | 403 » » » » Severity: hungBuilderSev, |
| 339 Time: messages.TimeToEpochTime(a.now()), | 404 Time: messages.TimeToEpochTime(a.now()), |
| 340 Links: links, | 405 Links: links, |
| 341 }) | 406 }) |
| 342 // Note, just because it's building doesn't mean it's in a good state. If the last N builds | 407 // Note, just because it's building doesn't mean it's in a good state. If the last N builds |
| 343 // all failed (for some large N) then this might still b e alertable. | 408 // all failed (for some large N) then this might still b e alertable. |
| 344 } | 409 } |
| 345 case messages.StateOffline: | 410 case messages.StateOffline: |
| 346 if elapsed > a.OfflineBuilderThresh { | 411 if elapsed > a.OfflineBuilderThresh { |
| 347 alerts = append(alerts, messages.Alert{ | 412 alerts = append(alerts, messages.Alert{ |
| 348 Key: fmt.Sprintf("%s.%s.offline", mn, bn), | 413 Key: fmt.Sprintf("%s.%s.offline", mn, bn), |
| 349 Title: fmt.Sprintf("%s.%s is offline.", mn, b n), | 414 Title: fmt.Sprintf("%s.%s is offline.", mn, b n), |
| 350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh), | 415 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh), |
| 351 » » » » Severity: 0, | 416 » » » » Severity: offlineBuilderSev, |
| 352 Time: messages.TimeToEpochTime(a.now()), | 417 Time: messages.TimeToEpochTime(a.now()), |
| 353 Links: links, | 418 Links: links, |
| 354 }) | 419 }) |
| 355 } | 420 } |
| 356 case messages.StateIdle: | 421 case messages.StateIdle: |
| 357 if b.PendingBuilds > a.IdleBuilderCountThresh { | 422 if b.PendingBuilds > a.IdleBuilderCountThresh { |
| 358 alerts = append(alerts, messages.Alert{ | 423 alerts = append(alerts, messages.Alert{ |
| 359 Key: fmt.Sprintf("%s.%s.idle", mn, bn), | 424 Key: fmt.Sprintf("%s.%s.idle", mn, bn), |
| 360 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn), | 425 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn), |
| 361 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh), | 426 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh), |
| 362 » » » » Severity: 0, | 427 » » » » Severity: idleBuilderSev, |
| 363 Time: messages.TimeToEpochTime(a.now()), | 428 Time: messages.TimeToEpochTime(a.now()), |
| 364 Links: links, | 429 Links: links, |
| 365 }) | 430 }) |
| 366 } | 431 } |
| 367 default: | 432 default: |
| 368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) | 433 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) |
| 369 } | 434 } |
| 370 | 435 |
| 371 return alerts, errs | 436 return alerts, errs |
| 372 } | 437 } |
| 373 | 438 |
| 374 // stepFailures returns the steps that have failed recently on builder bn. | 439 // stepFailures returns the steps that have failed recently on builder bn. |
| 375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) { | 440 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) { |
| 376 cc := cacheKeyForBuild(mn, bn, bID) | 441 cc := cacheKeyForBuild(mn, bn, bID) |
| 377 | 442 |
| 378 var err error // To avoid re-scoping b in the nested conditional below w ith a :=. | 443 var err error // To avoid re-scoping b in the nested conditional below w ith a :=. |
| 444 a.bLock.Lock() | |
| 379 b, ok := a.bCache[cc] | 445 b, ok := a.bCache[cc] |
| 446 a.bLock.Unlock() | |
| 380 if !ok { | 447 if !ok { |
| 381 log.Infof("Cache miss for %s", cc) | 448 log.Infof("Cache miss for %s", cc) |
| 382 b, err = a.Client.Build(mn, bn, bID) | 449 b, err = a.Client.Build(mn, bn, bID) |
| 383 if err != nil || b == nil { | 450 if err != nil || b == nil { |
| 384 log.Errorf("Error fetching build: %v", err) | 451 log.Errorf("Error fetching build: %v", err) |
| 385 return nil, err | 452 return nil, err |
| 386 } | 453 } |
| 387 } | 454 } |
| 388 | 455 |
| 389 ret := []stepFailure{} | 456 ret := []stepFailure{} |
| 457 | |
| 390 for _, s := range b.Steps { | 458 for _, s := range b.Steps { |
| 391 if !s.IsFinished || len(s.Results) == 0 { | 459 if !s.IsFinished || len(s.Results) == 0 { |
| 392 continue | 460 continue |
| 393 } | 461 } |
| 394 // Because Results in the json data is a homogeneous array, the unmarshaler | 462 // Because Results in the json data is a homogeneous array, the unmarshaler |
| 395 // doesn't have any type information to assert about it. We have to do | 463 // doesn't have any type information to assert about it. We have to do |
| 396 // some ugly runtime type assertion ourselves. | 464 // some ugly runtime type assertion ourselves. |
| 397 if r, ok := s.Results[0].(float64); ok { | 465 if r, ok := s.Results[0].(float64); ok { |
| 398 if r == 0 || r == 1 { | 466 if r == 0 || r == 1 { |
| 399 // This 0/1 check seems to be a convention or he uristic. A 0 or 1 | 467 // This 0/1 check seems to be a convention or he uristic. A 0 or 1 |
| (...skipping 22 matching lines...) Expand all Loading... | |
| 422 ret := []messages.Alert{} | 490 ret := []messages.Alert{} |
| 423 type res struct { | 491 type res struct { |
| 424 f stepFailure | 492 f stepFailure |
| 425 a *messages.Alert | 493 a *messages.Alert |
| 426 err error | 494 err error |
| 427 } | 495 } |
| 428 | 496 |
| 429 // Might not need full capacity buffer, since some failures are ignored below. | 497 // Might not need full capacity buffer, since some failures are ignored below. |
| 430 rs := make(chan res, len(failures)) | 498 rs := make(chan res, len(failures)) |
| 431 | 499 |
| 500 scannedFailures := []stepFailure{} | |
| 432 for _, f := range failures { | 501 for _, f := range failures { |
| 433 // goroutine/channel because the reasonsForFailure call potentia lly | 502 // goroutine/channel because the reasonsForFailure call potentia lly |
| 434 // blocks on IO. | 503 // blocks on IO. |
| 504 if f.step.Name == "steps" { | |
| 505 continue | |
| 506 // The actual breaking step will appear later. | |
| 507 } | |
| 508 scannedFailures = append(scannedFailures, f) | |
| 435 go func(f stepFailure) { | 509 go func(f stepFailure) { |
| 436 alr := messages.Alert{ | 510 alr := messages.Alert{ |
| 437 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), | 511 Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), |
| 438 Time: messages.EpochTime(a.now().Unix()), | 512 Time: messages.EpochTime(a.now().Unix()), |
| 439 Type: "buildfailure", | 513 Type: "buildfailure", |
| 440 } | 514 } |
| 441 | 515 |
| 516 // If the builder has been failing on the same step for multiple builds in a row, | |
| 517 // we should have only one alert but indicate the range of builds affected. | |
| 518 // These are set in FirstFailure and LastFailure. | |
| 442 bf := messages.BuildFailure{ | 519 bf := messages.BuildFailure{ |
| 443 // FIXME: group builders? | 520 // FIXME: group builders? |
| 444 Builders: []messages.AlertedBuilder{ | 521 Builders: []messages.AlertedBuilder{ |
| 445 { | 522 { |
| 446 Name: f.builderName, | 523 Name: f.builderName, |
| 447 » » » » » » URL: f.URL(), | 524 » » » » » » URL: fmt.Sprintf("http s://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName), |
| 448 » » » » » » FirstFailure: 0, | 525 » » » » » » FirstFailure: f.build.Number, |
| 449 » » » » » » LatestFailure: 1, | 526 » » » » » » LatestFailure: f.build.Number, |
| 450 }, | 527 }, |
| 451 }, | 528 }, |
| 452 // TODO: RegressionRanges: | 529 // TODO: RegressionRanges: |
| 453 // look into Builds.SourceStamp.Changes. | 530 // look into Builds.SourceStamp.Changes. |
| 454 } | 531 } |
| 455 | 532 |
| 456 reasons := a.reasonsForFailure(f) | 533 reasons := a.reasonsForFailure(f) |
| 457 for _, r := range reasons { | 534 for _, r := range reasons { |
| 458 bf.Reasons = append(bf.Reasons, messages.Reason{ | 535 bf.Reasons = append(bf.Reasons, messages.Reason{ |
| 459 TestName: r, | 536 TestName: r, |
| 460 Step: f.step.Name, | 537 Step: f.step.Name, |
| 538 URL: f.URL(), | |
| 461 }) | 539 }) |
| 462 } | 540 } |
| 463 | 541 |
| 464 alr.Extension = bf | 542 alr.Extension = bf |
| 465 if len(bf.Reasons) == 0 { | 543 if len(bf.Reasons) == 0 { |
| 466 » » » » log.Warnf("No reasons for step failure: %s", ale rtKey(f.masterName, f.builderName, f.step.Name, "")) | 544 » » » » alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "") |
| 467 » » » » rs <- res{ | 545 » » » » log.Warnf("No reasons for step failure: %s", alr .Key) |
| 468 » » » » » f: f, | |
| 469 » » » » } | |
| 470 } else { | 546 } else { |
| 471 // Should the key include all of the reasons? | 547 // Should the key include all of the reasons? |
| 472 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0]) | 548 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0]) |
| 473 | 549 » » » } |
| 474 » » » » rs <- res{ | 550 » » » rs <- res{ |
| 475 » » » » » f: f, | 551 » » » » f: f, |
| 476 » » » » » a: &alr, | 552 » » » » a: &alr, |
| 477 » » » » » err: nil, | 553 » » » » err: nil, |
| 478 » » » » } | |
| 479 } | 554 } |
| 480 }(f) | 555 }(f) |
| 481 } | 556 } |
| 482 | 557 |
| 483 » for _ = range failures { | 558 » for _ = range scannedFailures { |
| 484 r := <-rs | 559 r := <-rs |
| 485 if r.a != nil { | 560 if r.a != nil { |
| 486 ret = append(ret, *r.a) | 561 ret = append(ret, *r.a) |
| 487 } | 562 } |
| 488 } | 563 } |
| 489 | 564 |
| 490 return ret, nil | 565 return ret, nil |
| 491 } | 566 } |
| 492 | 567 |
| 493 // reasonsForFailure examines the step failure and applies some heuristics to | 568 // reasonsForFailure examines the step failure and applies some heuristics to |
| 494 // to find the cause. It may make blocking IO calls in the process. | 569 // to find the cause. It may make blocking IO calls in the process. |
| 495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { | 570 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { |
| 496 ret := []string{} | 571 ret := []string{} |
| 497 recognized := false | 572 recognized := false |
| 498 log.Infof("Checking for reasons for failure step: %v", f.step.Name) | 573 log.Infof("Checking for reasons for failure step: %v", f.step.Name) |
| 499 for _, sfa := range a.StepAnalyzers { | 574 for _, sfa := range a.StepAnalyzers { |
| 500 res, err := sfa.Analyze(f) | 575 res, err := sfa.Analyze(f) |
| 501 if err != nil { | 576 if err != nil { |
| 577 // TODO: return something that contains errors *and* rea sons. | |
| 502 log.Errorf("Error get reasons from StepAnalyzer: %v", er r) | 578 log.Errorf("Error get reasons from StepAnalyzer: %v", er r) |
| 579 continue | |
| 503 } | 580 } |
| 504 if res.Recognized { | 581 if res.Recognized { |
| 505 recognized = true | 582 recognized = true |
| 506 ret = append(ret, res.Reasons...) | 583 ret = append(ret, res.Reasons...) |
| 507 } | 584 } |
| 508 } | 585 } |
| 509 | 586 |
| 510 if !recognized { | 587 if !recognized { |
| 511 // TODO: log and report frequently encountered unrecognized buil der step | 588 // TODO: log and report frequently encountered unrecognized buil der step |
| 512 // failure names. | 589 // failure names. |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. | 629 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. |
| 553 func oldEscape(s string) string { | 630 func oldEscape(s string) string { |
| 554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) | 631 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) |
| 555 } | 632 } |
| 556 | 633 |
| 557 // URL returns a url to builder step failure page. | 634 // URL returns a url to builder step failure page. |
| 558 func (f stepFailure) URL() string { | 635 func (f stepFailure) URL() string { |
| 559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s", | 636 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/% d/steps/%s", |
| 560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name)) | 637 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap e(f.step.Name)) |
| 561 } | 638 } |
| OLD | NEW |