| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package analyzer | 5 package analyzer |
| 6 | 6 |
| 7 import ( | 7 import ( |
| 8 "errors" | 8 "errors" |
| 9 "fmt" | 9 "fmt" |
| 10 "net/url" | 10 "net/url" |
| 11 "path/filepath" | 11 "path/filepath" |
| 12 "sort" | 12 "sort" |
| 13 "strings" | 13 "strings" |
| 14 "sync" |
| 14 "time" | 15 "time" |
| 15 | 16 |
| 16 "github.com/Sirupsen/logrus" | 17 "github.com/Sirupsen/logrus" |
| 17 | 18 |
| 18 "infra/monitoring/client" | 19 "infra/monitoring/client" |
| 19 "infra/monitoring/messages" | 20 "infra/monitoring/messages" |
| 20 ) | 21 ) |
| 21 | 22 |
| 22 const ( | 23 const ( |
| 23 // StepCompletedRun is a synthetic step name used to indicate the build
run is complete. | 24 // StepCompletedRun is a synthetic step name used to indicate the build
run is complete. |
| 24 » StepCompletedRun = "completed run" | 25 » StepCompletedRun = "completed run" |
| 26 » treeCloserPri = 0 |
| 27 » staleMasterSev = 0 |
| 28 » staleBuilderSev = 0 |
| 29 » hungBuilderSev = 1 |
| 30 » idleBuilderSev = 1 |
| 31 » offlineBuilderSev = 1 |
| 25 ) | 32 ) |
| 26 | 33 |
| 27 var ( | 34 var ( |
| 28 log = logrus.New() | 35 log = logrus.New() |
| 29 ) | 36 ) |
| 30 | 37 |
| 31 var ( | 38 var ( |
| 32 errNoBuildSteps = errors.New("No build steps") | 39 errNoBuildSteps = errors.New("No build steps") |
| 33 ) | 40 ) |
| 34 | 41 |
| (...skipping 14 matching lines...) Expand all Loading... |
| 49 // Reasons lists the reasons for the stepFailure determined by the StepF
ailureAnalyzer. | 56 // Reasons lists the reasons for the stepFailure determined by the StepF
ailureAnalyzer. |
| 50 Reasons []string | 57 Reasons []string |
| 51 } | 58 } |
| 52 | 59 |
| 53 // MasterAnalyzer runs the process of checking masters, builders, test results a
nd so on, | 60 // MasterAnalyzer runs the process of checking masters, builders, test results a
nd so on, |
| 54 // in order to produce alerts. | 61 // in order to produce alerts. |
| 55 type MasterAnalyzer struct { | 62 type MasterAnalyzer struct { |
| 56 // MaxRecentBuilds is the maximum number of recent builds to check, per
builder. | 63 // MaxRecentBuilds is the maximum number of recent builds to check, per
builder. |
| 57 MaxRecentBuilds int | 64 MaxRecentBuilds int |
| 58 | 65 |
| 66 // MinRecentBuilds is the minimum number of recent builds to check, per
builder. |
| 67 MinRecentBuilds int |
| 68 |
| 59 // StepAnalzers are the set of build step failure analyzers to be checke
d on | 69 // StepAnalzers are the set of build step failure analyzers to be checke
d on |
| 60 // build step failures. | 70 // build step failures. |
| 61 StepAnalyzers []StepAnalyzer | 71 StepAnalyzers []StepAnalyzer |
| 62 | 72 |
| 63 // Client is the Client implementation for fetching json from CBE, build
s, etc. | 73 // Client is the Client implementation for fetching json from CBE, build
s, etc. |
| 64 Client client.Client | 74 Client client.Client |
| 65 | 75 |
| 66 // HungBuilerThresh is the maxumum length of time a builder may be in st
ate "building" | 76 // HungBuilerThresh is the maxumum length of time a builder may be in st
ate "building" |
| 67 // before triggering a "hung builder" alert. | 77 // before triggering a "hung builder" alert. |
| 68 HungBuilderThresh time.Duration | 78 HungBuilderThresh time.Duration |
| 69 | 79 |
| 70 // OfflineBuilderThresh is the maximum length of time a builder may be i
n state "offline" | 80 // OfflineBuilderThresh is the maximum length of time a builder may be i
n state "offline" |
| 71 // before triggering an "offline builder" alert. | 81 // before triggering an "offline builder" alert. |
| 72 OfflineBuilderThresh time.Duration | 82 OfflineBuilderThresh time.Duration |
| 73 | 83 |
| 74 // IdleBuilderCountThresh is the maximum number of builds a builder may
have in queue | 84 // IdleBuilderCountThresh is the maximum number of builds a builder may
have in queue |
| 75 // while in the "idle" state before triggering an "idle builder" alert. | 85 // while in the "idle" state before triggering an "idle builder" alert. |
| 76 IdleBuilderCountThresh int64 | 86 IdleBuilderCountThresh int64 |
| 77 | 87 |
| 78 // StaleMasterThreshold is the maximum age that master data from CBE can
be before | 88 // StaleMasterThreshold is the maximum age that master data from CBE can
be before |
| 79 // triggering a "stale master" alert. | 89 // triggering a "stale master" alert. |
| 80 StaleMasterThreshold time.Duration | 90 StaleMasterThreshold time.Duration |
| 81 | 91 |
| 82 // bCache is a map of build cache key to Build message. | 92 // bCache is a map of build cache key to Build message. |
| 83 bCache map[string]*messages.Builds | 93 bCache map[string]*messages.Builds |
| 94 // bLock protects bCache |
| 95 bLock *sync.Mutex |
| 96 |
| 97 // These limit the scope analysis, useful for debugging. |
| 98 TreeOnly string |
| 99 MasterOnly string |
| 100 BuilderOnly string |
| 101 BuildOnly int64 |
| 84 | 102 |
| 85 // now is useful for mocking the system clock in testing. | 103 // now is useful for mocking the system clock in testing. |
| 86 now func() time.Time | 104 now func() time.Time |
| 87 } | 105 } |
| 88 | 106 |
| 89 // New returns a new Analyzer. If client is nil, it assigns a default implementa
tion. | 107 // New returns a new Analyzer. If client is nil, it assigns a default implementa
tion. |
| 90 // maxBuilds is the maximum number of builds to check, per builder. | 108 // maxBuilds is the maximum number of builds to check, per builder. |
| 91 func New(c client.Client, maxBuilds int) *MasterAnalyzer { | 109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer { |
| 92 if c == nil { | 110 if c == nil { |
| 93 c = client.New() | 111 c = client.New() |
| 94 } | 112 } |
| 95 | 113 |
| 96 return &MasterAnalyzer{ | 114 return &MasterAnalyzer{ |
| 97 Client: c, | 115 Client: c, |
| 98 MaxRecentBuilds: maxBuilds, | 116 MaxRecentBuilds: maxBuilds, |
| 117 MinRecentBuilds: minBuilds, |
| 99 HungBuilderThresh: 3 * time.Hour, | 118 HungBuilderThresh: 3 * time.Hour, |
| 100 OfflineBuilderThresh: 90 * time.Minute, | 119 OfflineBuilderThresh: 90 * time.Minute, |
| 101 IdleBuilderCountThresh: 50, | 120 IdleBuilderCountThresh: 50, |
| 102 StaleMasterThreshold: 10 * time.Minute, | 121 StaleMasterThreshold: 10 * time.Minute, |
| 103 StepAnalyzers: []StepAnalyzer{ | 122 StepAnalyzers: []StepAnalyzer{ |
| 104 &TestFailureAnalyzer{Client: c}, | 123 &TestFailureAnalyzer{Client: c}, |
| 124 &CompileFailureAnalyzer{Client: c}, |
| 105 }, | 125 }, |
| 106 | 126 |
| 107 now: func() time.Time { | 127 now: func() time.Time { |
| 108 return time.Now() | 128 return time.Now() |
| 109 }, | 129 }, |
| 110 | 130 |
| 111 bCache: map[string]*messages.Builds{}, | 131 bCache: map[string]*messages.Builds{}, |
| 132 bLock: &sync.Mutex{}, |
| 112 } | 133 } |
| 113 } | 134 } |
| 114 | 135 |
| 115 // MasterAlerts returns alerts generated from the master at URL. | 136 // MasterAlerts returns alerts generated from the master at URL. |
| 116 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m
essages.Alert { | 137 func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m
essages.Alert { |
| 117 ret := []messages.Alert{} | 138 ret := []messages.Alert{} |
| 118 | 139 |
| 119 // Copied logic from builder_messages. | 140 // Copied logic from builder_messages. |
| 120 // No created_timestamp should be a warning sign, no? | 141 // No created_timestamp should be a warning sign, no? |
| 121 if be.CreatedTimestamp == messages.EpochTime(0) { | 142 if be.CreatedTimestamp == messages.EpochTime(0) { |
| 122 return ret | 143 return ret |
| 123 } | 144 } |
| 124 | 145 |
| 125 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) | 146 elapsed := a.now().Sub(be.CreatedTimestamp.Time()) |
| 126 if elapsed > a.StaleMasterThreshold { | 147 if elapsed > a.StaleMasterThreshold { |
| 127 ret = append(ret, messages.Alert{ | 148 ret = append(ret, messages.Alert{ |
| 128 Key: fmt.Sprintf("stale master: %v", url), | 149 Key: fmt.Sprintf("stale master: %v", url), |
| 129 Title: "Stale Master Data", | 150 Title: "Stale Master Data", |
| 130 Body: fmt.Sprintf("%s elapsed since last update (%s)
.", elapsed, be.CreatedTimestamp.Time()), | 151 Body: fmt.Sprintf("%s elapsed since last update (%s)
.", elapsed, be.CreatedTimestamp.Time()), |
| 131 » » » Severity: 0, | 152 » » » Severity: staleMasterSev, |
| 132 Time: messages.TimeToEpochTime(a.now()), | 153 Time: messages.TimeToEpochTime(a.now()), |
| 133 Links: []messages.Link{{"Master", url}}, | 154 Links: []messages.Link{{"Master", url}}, |
| 134 // No type or extension for now. | 155 // No type or extension for now. |
| 135 }) | 156 }) |
| 136 } | 157 } |
| 137 if elapsed < 0 { | 158 if elapsed < 0 { |
| 138 // Add this to the alerts returned, rather than just log it? | 159 // Add this to the alerts returned, rather than just log it? |
| 139 log.Errorf("Master %s timestamp is newer than current time (%s):
%s old.", url, a.now(), elapsed) | 160 log.Errorf("Master %s timestamp is newer than current time (%s):
%s old.", url, a.now(), elapsed) |
| 140 } | 161 } |
| 141 | 162 |
| (...skipping 11 matching lines...) Expand all Loading... |
| 153 type r struct { | 174 type r struct { |
| 154 bn string | 175 bn string |
| 155 b messages.Builders | 176 b messages.Builders |
| 156 alerts []messages.Alert | 177 alerts []messages.Alert |
| 157 err []error | 178 err []error |
| 158 } | 179 } |
| 159 c := make(chan r, len(be.Builders)) | 180 c := make(chan r, len(be.Builders)) |
| 160 | 181 |
| 161 // TODO: get a list of all the running builds from be.Slaves? It | 182 // TODO: get a list of all the running builds from be.Slaves? It |
| 162 // appears to be used later on in the original py. | 183 // appears to be used later on in the original py. |
| 184 scannedBuilders := []string{} |
| 163 for bn, b := range be.Builders { | 185 for bn, b := range be.Builders { |
| 186 if a.BuilderOnly != "" && bn != a.BuilderOnly { |
| 187 continue |
| 188 } |
| 189 scannedBuilders = append(scannedBuilders, bn) |
| 164 go func(bn string, b messages.Builders) { | 190 go func(bn string, b messages.Builders) { |
| 165 out := r{bn: bn, b: b} | 191 out := r{bn: bn, b: b} |
| 166 defer func() { | 192 defer func() { |
| 167 c <- out | 193 c <- out |
| 168 }() | 194 }() |
| 169 | 195 |
| 170 // This blocks on IO, hence the goroutine. | 196 // This blocks on IO, hence the goroutine. |
| 171 a.warmBuildCache(mn, bn, b.CachedBuilds) | 197 a.warmBuildCache(mn, bn, b.CachedBuilds) |
| 172 | |
| 173 // Each call to builderAlerts may trigger blocking json
fetches, | 198 // Each call to builderAlerts may trigger blocking json
fetches, |
| 174 // but it has a data dependency on the above cache-warmi
ng call, so | 199 // but it has a data dependency on the above cache-warmi
ng call, so |
| 175 // the logic remains serial. | 200 // the logic remains serial. |
| 176 out.alerts, out.err = a.builderAlerts(mn, bn, &b) | 201 out.alerts, out.err = a.builderAlerts(mn, bn, &b) |
| 177 }(bn, b) | 202 }(bn, b) |
| 178 } | 203 } |
| 179 | 204 |
| 180 ret := []messages.Alert{} | 205 ret := []messages.Alert{} |
| 181 » for bn := range be.Builders { | 206 » for _, bn := range scannedBuilders { |
| 182 r := <-c | 207 r := <-c |
| 183 if len(r.err) != 0 { | 208 if len(r.err) != 0 { |
| 184 // TODO: add a special alert for this too? | 209 // TODO: add a special alert for this too? |
| 185 log.Errorf("Error getting alerts for builder %s: %v", bn
, r.err) | 210 log.Errorf("Error getting alerts for builder %s: %v", bn
, r.err) |
| 186 } else { | 211 } else { |
| 187 ret = append(ret, r.alerts...) | 212 ret = append(ret, r.alerts...) |
| 188 } | 213 } |
| 189 } | 214 } |
| 190 | 215 |
| 191 return ret | 216 return ret |
| 192 } | 217 } |
| 193 | 218 |
| 194 // masterName extracts the name of the master from the master's URL. | 219 // masterName extracts the name of the master from the master's URL. |
| 195 func masterName(URL string) (string, error) { | 220 func masterName(URL string) (string, error) { |
| 196 mURL, err := url.Parse(URL) | 221 mURL, err := url.Parse(URL) |
| 197 if err != nil { | 222 if err != nil { |
| 198 return "", err | 223 return "", err |
| 199 } | 224 } |
| 200 pathParts := strings.Split(mURL.Path, "/") | 225 pathParts := strings.Split(mURL.Path, "/") |
| 201 » return pathParts[len(pathParts)-1], nil | 226 » if len(pathParts) < 2 { |
| 227 » » return "", fmt.Errorf("Couldn't parse master name from %s", URL) |
| 228 » } |
| 229 » // -2 to lop the /json off the http://.../{master.name}/json suffix |
| 230 » return pathParts[len(pathParts)-2], nil |
| 202 } | 231 } |
| 203 | 232 |
| 204 func cacheKeyForBuild(master, builder string, number int64) string { | 233 func cacheKeyForBuild(master, builder string, number int64) string { |
| 205 return filepath.FromSlash( | 234 return filepath.FromSlash( |
| 206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE
scape(builder), number)) | 235 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE
scape(builder), number)) |
| 207 } | 236 } |
| 208 | 237 |
| 209 // TODO: actually write the on-disk cache. | 238 // TODO: actually write the on-disk cache. |
| 210 func filenameForCacheKey(cc string) string { | 239 func filenameForCacheKey(cc string) string { |
| 211 cc = strings.Replace(cc, "/", "_", -1) | 240 cc = strings.Replace(cc, "/", "_", -1) |
| (...skipping 10 matching lines...) Expand all Loading... |
| 222 v.Add("builder", builder) | 251 v.Add("builder", builder) |
| 223 | 252 |
| 224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds?
%s", v.Encode()) | 253 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds?
%s", v.Encode()) |
| 225 res := struct { | 254 res := struct { |
| 226 Builds []messages.Builds `json:"builds"` | 255 Builds []messages.Builds `json:"builds"` |
| 227 }{} | 256 }{} |
| 228 | 257 |
| 229 // TODO: add FetchBuilds to the client interface. Take a list of {master
, builder} and | 258 // TODO: add FetchBuilds to the client interface. Take a list of {master
, builder} and |
| 230 // return (map[{master, builder}][]Builds, map [{master, builder}]error) | 259 // return (map[{master, builder}][]Builds, map [{master, builder}]error) |
| 231 // That way we can do all of these in parallel. | 260 // That way we can do all of these in parallel. |
| 261 |
| 232 status, err := a.Client.JSON(URL, &res) | 262 status, err := a.Client.JSON(URL, &res) |
| 233 if err != nil { | 263 if err != nil { |
| 234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) | 264 log.Errorf("Error (%d) fetching %s: %s", status, URL, err) |
| 235 } | 265 } |
| 236 | 266 |
| 267 a.bLock.Lock() |
| 237 for _, b := range res.Builds { | 268 for _, b := range res.Builds { |
| 238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b | 269 » » // TODO: consider making res.Builds be []*messages.Builds instea
d of []messages.Builds |
| 270 » » ba := b |
| 271 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba |
| 239 } | 272 } |
| 273 a.bLock.Unlock() |
| 240 } | 274 } |
| 241 | 275 |
| 242 // This type is used for sorting build IDs. | 276 // This type is used for sorting build IDs. |
| 243 type buildIDs []int64 | 277 type buildIDs []int64 |
| 244 | 278 |
| 245 func (a buildIDs) Len() int { return len(a) } | 279 func (a buildIDs) Len() int { return len(a) } |
| 246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } | 280 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] } |
| 247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } | 281 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] } |
| 248 | 282 |
| 249 // latestBuildStep returns the latest build step name and update time, and an er
ror | 283 // latestBuildStep returns the latest build step name and update time, and an er
ror |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] | 317 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds] |
| 284 } | 318 } |
| 285 if len(recentBuildIDs) == 0 { | 319 if len(recentBuildIDs) == 0 { |
| 286 // TODO: Make an alert for this? | 320 // TODO: Make an alert for this? |
| 287 log.Errorf("No recent builds for %s.%s", mn, bn) | 321 log.Errorf("No recent builds for %s.%s", mn, bn) |
| 288 return alerts, errs | 322 return alerts, errs |
| 289 } | 323 } |
| 290 log.Infof("Checking %d most recent builds for alertable step failures: %
s/%s", len(recentBuildIDs), mn, bn) | 324 log.Infof("Checking %d most recent builds for alertable step failures: %
s/%s", len(recentBuildIDs), mn, bn) |
| 291 | 325 |
| 292 // Check for alertable step failures. | 326 // Check for alertable step failures. |
| 293 » for _, buildID := range recentBuildIDs { | 327 » stepFailureAlerts := map[string]messages.Alert{} |
| 328 » firstFailureInstance := map[string]int64{} |
| 329 |
| 330 » for i, buildID := range recentBuildIDs { |
| 294 failures, err := a.stepFailures(mn, bn, buildID) | 331 failures, err := a.stepFailures(mn, bn, buildID) |
| 295 if err != nil { | 332 if err != nil { |
| 296 errs = append(errs, err) | 333 errs = append(errs, err) |
| 297 } | 334 } |
| 335 if len(failures) == 0 && i > a.MinRecentBuilds { |
| 336 // Bail as soon as we find a successful build prior to t
he most recent couple of builds. |
| 337 break |
| 338 } |
| 339 |
| 298 as, err := a.stepFailureAlerts(failures) | 340 as, err := a.stepFailureAlerts(failures) |
| 299 if err != nil { | 341 if err != nil { |
| 300 errs = append(errs, err) | 342 errs = append(errs, err) |
| 301 } | 343 } |
| 302 » » alerts = append(alerts, as...) | 344 |
| 345 » » lastKey := "" |
| 346 » » // This probably isn't exactly what we want since multiple build
ers may have the same alert firing. |
| 347 » » // This just merges build ranges on a per-builder basis. |
| 348 » » for _, alr := range as { |
| 349 » » » // Only keep the most recent alert. Since recentBuildID
s is |
| 350 » » » // sorted newest build first, ignore older instances of
the alert. |
| 351 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok { |
| 352 » » » » stepFailureAlerts[alr.Key] = alr |
| 353 » » » } |
| 354 » » » if alr.Key == lastKey || lastKey == "" { |
| 355 » » » » if bf, ok := alr.Extension.(messages.BuildFailur
e); ok { |
| 356 » » » » » firstFailureInstance[alr.Key] = bf.Build
ers[0].FirstFailure |
| 357 » » » » } else { |
| 358 » » » » » log.Errorf("Couldn't cast alert extensio
n to BuildFailure: %s", alr.Key) |
| 359 » » » » } |
| 360 » » » } |
| 361 » » » lastKey = alr.Key |
| 362 » » } |
| 303 } | 363 } |
| 304 | 364 |
| 305 » // Check for stale builders. Latest build is the first in the list. | 365 » for _, alr := range stepFailureAlerts { |
| 366 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure =
firstFailureInstance[alr.Key] |
| 367 » » alerts = append(alerts, alr) |
| 368 » } |
| 369 |
| 370 » // Check for stale/idle/offline builders. Latest build is the first in
the list. |
| 306 lastBuildID := recentBuildIDs[0] | 371 lastBuildID := recentBuildIDs[0] |
| 307 log.Infof("Checking last build ID: %d", lastBuildID) | 372 log.Infof("Checking last build ID: %d", lastBuildID) |
| 373 |
| 308 // TODO: get this from cache. | 374 // TODO: get this from cache. |
| 309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID) | 375 » a.bLock.Lock() |
| 310 » if err != nil { | 376 » lastBuild := a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)] |
| 311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for
%s.%s: %s", lastBuildID, mn, bn, err)) | 377 » a.bLock.Unlock() |
| 312 » » return alerts, errs | 378 » if lastBuild == nil { |
| 379 » » var err error |
| 380 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID) |
| 381 » » if err != nil { |
| 382 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil
d %d for %s.%s: %s", lastBuildID, mn, bn, err)) |
| 383 » » » return alerts, errs |
| 384 » » } |
| 313 } | 385 } |
| 314 | 386 |
| 315 // Examining only the latest build is probably suboptimal since if it's
still in progress it might | 387 // Examining only the latest build is probably suboptimal since if it's
still in progress it might |
| 316 // not have hit a step that is going to fail and has failed repeatedly f
or the last few builds. | 388 // not have hit a step that is going to fail and has failed repeatedly f
or the last few builds. |
| 317 // AKA "Reliable failures". TODO: Identify "Reliable failures" | 389 // AKA "Reliable failures". TODO: Identify "Reliable failures" |
| 318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) | 390 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild) |
| 319 if err != nil { | 391 if err != nil { |
| 320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f
or %s.%s: %v", mn, bn, err)) | 392 errs = append(errs, fmt.Errorf("Couldn't get latest build step f
or %s.%s: %v", mn, bn, err)) |
| 321 return alerts, errs | 393 return alerts, errs |
| 322 | 394 |
| 323 } | 395 } |
| 324 elapsed := a.now().Sub(lastUpdated.Time()) | 396 elapsed := a.now().Sub(lastUpdated.Time()) |
| 325 links := []messages.Link{ | 397 links := []messages.Link{ |
| 326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder
s/%s", mn, bn)}, | 398 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder
s/%s", mn, bn)}, |
| 327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil
ders/%s/builds/%d", mn, bn, lastBuildID)}, | 399 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil
ders/%s/builds/%d", mn, bn, lastBuildID)}, |
| 328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s
/builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, | 400 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s
/builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)}, |
| 329 } | 401 } |
| 330 | 402 |
| 331 switch b.State { | 403 switch b.State { |
| 332 case messages.StateBuilding: | 404 case messages.StateBuilding: |
| 333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun
{ | 405 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun
{ |
| 334 alerts = append(alerts, messages.Alert{ | 406 alerts = append(alerts, messages.Alert{ |
| 335 Key: fmt.Sprintf("%s.%s.hung", mn, bn), | 407 Key: fmt.Sprintf("%s.%s.hung", mn, bn), |
| 336 Title: fmt.Sprintf("%s.%s is hung in step %s.
", mn, bn, lastStep), | 408 Title: fmt.Sprintf("%s.%s is hung in step %s.
", mn, bn, lastStep), |
| 337 Body: fmt.Sprintf("%s.%s has been building f
or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed
, lastUpdated.Time(), a.HungBuilderThresh), | 409 Body: fmt.Sprintf("%s.%s has been building f
or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed
, lastUpdated.Time(), a.HungBuilderThresh), |
| 338 » » » » Severity: 0, | 410 » » » » Severity: hungBuilderSev, |
| 339 Time: messages.TimeToEpochTime(a.now()), | 411 Time: messages.TimeToEpochTime(a.now()), |
| 340 Links: links, | 412 Links: links, |
| 341 }) | 413 }) |
| 342 // Note, just because it's building doesn't mean it's in
a good state. If the last N builds | 414 // Note, just because it's building doesn't mean it's in
a good state. If the last N builds |
| 343 // all failed (for some large N) then this might still b
e alertable. | 415 // all failed (for some large N) then this might still b
e alertable. |
| 344 } | 416 } |
| 345 case messages.StateOffline: | 417 case messages.StateOffline: |
| 346 if elapsed > a.OfflineBuilderThresh { | 418 if elapsed > a.OfflineBuilderThresh { |
| 347 alerts = append(alerts, messages.Alert{ | 419 alerts = append(alerts, messages.Alert{ |
| 348 Key: fmt.Sprintf("%s.%s.offline", mn, bn), | 420 Key: fmt.Sprintf("%s.%s.offline", mn, bn), |
| 349 Title: fmt.Sprintf("%s.%s is offline.", mn, b
n), | 421 Title: fmt.Sprintf("%s.%s is offline.", mn, b
n), |
| 350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo
r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed,
lastUpdated.Time(), a.OfflineBuilderThresh), | 422 » » » » Body: fmt.Sprintf("%s.%s has been offline fo
r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps
ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh), |
| 351 » » » » Severity: 0, | 423 » » » » Severity: offlineBuilderSev, |
| 352 Time: messages.TimeToEpochTime(a.now()), | 424 Time: messages.TimeToEpochTime(a.now()), |
| 353 Links: links, | 425 Links: links, |
| 354 }) | 426 }) |
| 355 } | 427 } |
| 356 case messages.StateIdle: | 428 case messages.StateIdle: |
| 357 if b.PendingBuilds > a.IdleBuilderCountThresh { | 429 if b.PendingBuilds > a.IdleBuilderCountThresh { |
| 358 alerts = append(alerts, messages.Alert{ | 430 alerts = append(alerts, messages.Alert{ |
| 359 Key: fmt.Sprintf("%s.%s.idle", mn, bn), | 431 Key: fmt.Sprintf("%s.%s.idle", mn, bn), |
| 360 Title: fmt.Sprintf("%s.%s is idle with too ma
ny pending builds.", mn, bn), | 432 Title: fmt.Sprintf("%s.%s is idle with too ma
ny pending builds.", mn, bn), |
| 361 Body: fmt.Sprintf("%s.%s is idle with %d pen
ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle
BuilderCountThresh), | 433 Body: fmt.Sprintf("%s.%s is idle with %d pen
ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle
BuilderCountThresh), |
| 362 » » » » Severity: 0, | 434 » » » » Severity: idleBuilderSev, |
| 363 Time: messages.TimeToEpochTime(a.now()), | 435 Time: messages.TimeToEpochTime(a.now()), |
| 364 Links: links, | 436 Links: links, |
| 365 }) | 437 }) |
| 366 } | 438 } |
| 367 default: | 439 default: |
| 368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) | 440 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State) |
| 369 } | 441 } |
| 370 | 442 |
| 371 return alerts, errs | 443 return alerts, errs |
| 372 } | 444 } |
| 373 | 445 |
| 374 // stepFailures returns the steps that have failed recently on builder bn. | 446 // stepFailures returns the steps that have failed recently on builder bn. |
| 375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
ilure, error) { | 447 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
ilure, error) { |
| 376 cc := cacheKeyForBuild(mn, bn, bID) | 448 cc := cacheKeyForBuild(mn, bn, bID) |
| 377 | 449 |
| 378 var err error // To avoid re-scoping b in the nested conditional below w
ith a :=. | 450 var err error // To avoid re-scoping b in the nested conditional below w
ith a :=. |
| 451 a.bLock.Lock() |
| 379 b, ok := a.bCache[cc] | 452 b, ok := a.bCache[cc] |
| 453 a.bLock.Unlock() |
| 380 if !ok { | 454 if !ok { |
| 381 log.Infof("Cache miss for %s", cc) | 455 log.Infof("Cache miss for %s", cc) |
| 382 b, err = a.Client.Build(mn, bn, bID) | 456 b, err = a.Client.Build(mn, bn, bID) |
| 383 if err != nil || b == nil { | 457 if err != nil || b == nil { |
| 384 log.Errorf("Error fetching build: %v", err) | 458 log.Errorf("Error fetching build: %v", err) |
| 385 return nil, err | 459 return nil, err |
| 386 } | 460 } |
| 387 } | 461 } |
| 388 | 462 |
| 389 ret := []stepFailure{} | 463 ret := []stepFailure{} |
| 464 |
| 390 for _, s := range b.Steps { | 465 for _, s := range b.Steps { |
| 391 if !s.IsFinished || len(s.Results) == 0 { | 466 if !s.IsFinished || len(s.Results) == 0 { |
| 392 continue | 467 continue |
| 393 } | 468 } |
| 394 // Because Results in the json data is a homogeneous array, the
unmarshaler | 469 // Because Results in the json data is a homogeneous array, the
unmarshaler |
| 395 // doesn't have any type information to assert about it. We have
to do | 470 // doesn't have any type information to assert about it. We have
to do |
| 396 // some ugly runtime type assertion ourselves. | 471 // some ugly runtime type assertion ourselves. |
| 397 if r, ok := s.Results[0].(float64); ok { | 472 if r, ok := s.Results[0].(float64); ok { |
| 398 if r == 0 || r == 1 { | 473 if r == 0 || r == 1 { |
| 399 // This 0/1 check seems to be a convention or he
uristic. A 0 or 1 | 474 // This 0/1 check seems to be a convention or he
uristic. A 0 or 1 |
| (...skipping 22 matching lines...) Expand all Loading... |
| 422 ret := []messages.Alert{} | 497 ret := []messages.Alert{} |
| 423 type res struct { | 498 type res struct { |
| 424 f stepFailure | 499 f stepFailure |
| 425 a *messages.Alert | 500 a *messages.Alert |
| 426 err error | 501 err error |
| 427 } | 502 } |
| 428 | 503 |
| 429 // Might not need full capacity buffer, since some failures are ignored
below. | 504 // Might not need full capacity buffer, since some failures are ignored
below. |
| 430 rs := make(chan res, len(failures)) | 505 rs := make(chan res, len(failures)) |
| 431 | 506 |
| 507 scannedFailures := []stepFailure{} |
| 432 for _, f := range failures { | 508 for _, f := range failures { |
| 433 // goroutine/channel because the reasonsForFailure call potentia
lly | 509 // goroutine/channel because the reasonsForFailure call potentia
lly |
| 434 // blocks on IO. | 510 // blocks on IO. |
| 511 if f.step.Name == "steps" { |
| 512 continue |
| 513 // The actual breaking step will appear later. |
| 514 } |
| 515 scannedFailures = append(scannedFailures, f) |
| 435 go func(f stepFailure) { | 516 go func(f stepFailure) { |
| 436 alr := messages.Alert{ | 517 alr := messages.Alert{ |
| 437 Title: fmt.Sprintf("Builder step failure: %s.%s"
, f.masterName, f.builderName), | 518 Title: fmt.Sprintf("Builder step failure: %s.%s"
, f.masterName, f.builderName), |
| 438 Time: messages.EpochTime(a.now().Unix()), | 519 Time: messages.EpochTime(a.now().Unix()), |
| 439 Type: "buildfailure", | 520 Type: "buildfailure", |
| 440 } | 521 } |
| 441 | 522 |
| 523 // If the builder has been failing on the same step for
multiple builds in a row, |
| 524 // we should have only one alert but indicate the range
of builds affected. |
| 525 // These are set in FirstFailure and LastFailure. |
| 442 bf := messages.BuildFailure{ | 526 bf := messages.BuildFailure{ |
| 443 // FIXME: group builders? | 527 // FIXME: group builders? |
| 444 Builders: []messages.AlertedBuilder{ | 528 Builders: []messages.AlertedBuilder{ |
| 445 { | 529 { |
| 446 Name: f.builderName, | 530 Name: f.builderName, |
| 447 » » » » » » URL: f.URL(), | 531 » » » » » » URL: fmt.Sprintf("http
s://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName), |
| 448 » » » » » » FirstFailure: 0, | 532 » » » » » » FirstFailure: f.build.Number, |
| 449 » » » » » » LatestFailure: 1, | 533 » » » » » » LatestFailure: f.build.Number, |
| 450 }, | 534 }, |
| 451 }, | 535 }, |
| 452 // TODO: RegressionRanges: | 536 // TODO: RegressionRanges: |
| 453 // look into Builds.SourceStamp.Changes. | 537 // look into Builds.SourceStamp.Changes. |
| 454 } | 538 } |
| 455 | 539 |
| 456 reasons := a.reasonsForFailure(f) | 540 reasons := a.reasonsForFailure(f) |
| 457 for _, r := range reasons { | 541 for _, r := range reasons { |
| 458 bf.Reasons = append(bf.Reasons, messages.Reason{ | 542 bf.Reasons = append(bf.Reasons, messages.Reason{ |
| 459 TestName: r, | 543 TestName: r, |
| 460 Step: f.step.Name, | 544 Step: f.step.Name, |
| 545 URL: f.URL(), |
| 461 }) | 546 }) |
| 462 } | 547 } |
| 463 | 548 |
| 464 alr.Extension = bf | 549 alr.Extension = bf |
| 465 if len(bf.Reasons) == 0 { | 550 if len(bf.Reasons) == 0 { |
| 466 » » » » log.Warnf("No reasons for step failure: %s", ale
rtKey(f.masterName, f.builderName, f.step.Name, "")) | 551 » » » » alr.Key = alertKey(f.masterName, f.builderName,
f.step.Name, "") |
| 467 » » » » rs <- res{ | 552 » » » » log.Warnf("No reasons for step failure: %s", alr
.Key) |
| 468 » » » » » f: f, | |
| 469 » » » » } | |
| 470 } else { | 553 } else { |
| 471 // Should the key include all of the reasons? | 554 // Should the key include all of the reasons? |
| 472 alr.Key = alertKey(f.masterName, f.builderName,
f.step.Name, reasons[0]) | 555 alr.Key = alertKey(f.masterName, f.builderName,
f.step.Name, reasons[0]) |
| 473 | 556 » » » } |
| 474 » » » » rs <- res{ | 557 » » » rs <- res{ |
| 475 » » » » » f: f, | 558 » » » » f: f, |
| 476 » » » » » a: &alr, | 559 » » » » a: &alr, |
| 477 » » » » » err: nil, | 560 » » » » err: nil, |
| 478 » » » » } | |
| 479 } | 561 } |
| 480 }(f) | 562 }(f) |
| 481 } | 563 } |
| 482 | 564 |
| 483 » for _ = range failures { | 565 » for _ = range scannedFailures { |
| 484 r := <-rs | 566 r := <-rs |
| 485 if r.a != nil { | 567 if r.a != nil { |
| 486 ret = append(ret, *r.a) | 568 ret = append(ret, *r.a) |
| 487 } | 569 } |
| 488 } | 570 } |
| 489 | 571 |
| 490 return ret, nil | 572 return ret, nil |
| 491 } | 573 } |
| 492 | 574 |
| 493 // reasonsForFailure examines the step failure and applies some heuristics to | 575 // reasonsForFailure examines the step failure and applies some heuristics to |
| 494 // to find the cause. It may make blocking IO calls in the process. | 576 // to find the cause. It may make blocking IO calls in the process. |
| 495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { | 577 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string { |
| 496 ret := []string{} | 578 ret := []string{} |
| 497 recognized := false | 579 recognized := false |
| 498 log.Infof("Checking for reasons for failure step: %v", f.step.Name) | 580 log.Infof("Checking for reasons for failure step: %v", f.step.Name) |
| 499 for _, sfa := range a.StepAnalyzers { | 581 for _, sfa := range a.StepAnalyzers { |
| 500 res, err := sfa.Analyze(f) | 582 res, err := sfa.Analyze(f) |
| 501 if err != nil { | 583 if err != nil { |
| 584 // TODO: return something that contains errors *and* rea
sons. |
| 502 log.Errorf("Error get reasons from StepAnalyzer: %v", er
r) | 585 log.Errorf("Error get reasons from StepAnalyzer: %v", er
r) |
| 586 continue |
| 503 } | 587 } |
| 504 if res.Recognized { | 588 if res.Recognized { |
| 505 recognized = true | 589 recognized = true |
| 506 ret = append(ret, res.Reasons...) | 590 ret = append(ret, res.Reasons...) |
| 507 } | 591 } |
| 508 } | 592 } |
| 509 | 593 |
| 510 if !recognized { | 594 if !recognized { |
| 511 // TODO: log and report frequently encountered unrecognized buil
der step | 595 // TODO: log and report frequently encountered unrecognized buil
der step |
| 512 // failure names. | 596 // failure names. |
| (...skipping 39 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. | 636 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths. |
| 553 func oldEscape(s string) string { | 637 func oldEscape(s string) string { |
| 554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) | 638 return strings.Replace(url.QueryEscape(s), "+", "%20", -1) |
| 555 } | 639 } |
| 556 | 640 |
| 557 // URL returns a url to builder step failure page. | 641 // URL returns a url to builder step failure page. |
| 558 func (f stepFailure) URL() string { | 642 func (f stepFailure) URL() string { |
| 559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/%
d/steps/%s", | 643 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/%
d/steps/%s", |
| 560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap
e(f.step.Name)) | 644 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscap
e(f.step.Name)) |
| 561 } | 645 } |
| OLD | NEW |