| Index: go/src/infra/monitoring/analyzer/analyzer.go
|
| diff --git a/go/src/infra/monitoring/analyzer/analyzer.go b/go/src/infra/monitoring/analyzer/analyzer.go
|
| index 24754a5431edf82846887fd77e0da2beaa9e5905..ae05c0f67518ae65f121a76799a555a13efa8cf1 100644
|
| --- a/go/src/infra/monitoring/analyzer/analyzer.go
|
| +++ b/go/src/infra/monitoring/analyzer/analyzer.go
|
| @@ -11,6 +11,7 @@ import (
|
| "path/filepath"
|
| "sort"
|
| "strings"
|
| + "sync"
|
| "time"
|
|
|
| "github.com/Sirupsen/logrus"
|
| @@ -21,7 +22,13 @@ import (
|
|
|
| const (
|
| // StepCompletedRun is a synthetic step name used to indicate the build run is complete.
|
| - StepCompletedRun = "completed run"
|
| + StepCompletedRun = "completed run"
|
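| + // Priority and severity constants for generated alerts.
|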
| + treeCloserPri = 0
|
| + staleMasterSev = 0
|
| + staleBuilderSev = 0
|
| + hungBuilderSev = 1
|
| + idleBuilderSev = 1
|
| + offlineBuilderSev = 1
|
| )
|
|
|
| var (
|
| @@ -56,6 +63,9 @@ type MasterAnalyzer struct {
|
| // MaxRecentBuilds is the maximum number of recent builds to check, per builder.
|
| MaxRecentBuilds int
|
|
|
| + // MinRecentBuilds is the minimum number of recent builds to check, per builder.
|
| + MinRecentBuilds int
|
| +
|
| // StepAnalyzers are the set of build step failure analyzers to be checked on
|
| // build step failures.
|
| StepAnalyzers []StepAnalyzer
|
| @@ -81,6 +91,14 @@ type MasterAnalyzer struct {
|
|
|
| // bCache is a map of build cache key to Build message.
|
| bCache map[string]*messages.Builds
|
| + // bLock protects bCache
|
| + bLock *sync.Mutex
|
| +
|
| + // These limit the scope of the analysis; useful for debugging.
|
| + TreeOnly string
|
| + MasterOnly string
|
| + BuilderOnly string
|
| + BuildOnly int64
|
|
|
| // now is useful for mocking the system clock in testing.
|
| now func() time.Time
|
| @@ -88,7 +106,7 @@ type MasterAnalyzer struct {
|
|
|
| // New returns a new Analyzer. If client is nil, it assigns a default implementation.
|
| -// maxBuilds is the maximum number of builds to check, per builder.
|
| +// minBuilds and maxBuilds are the minimum and maximum number of recent builds to check, per builder.
|
| -func New(c client.Client, maxBuilds int) *MasterAnalyzer {
|
| +func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer {
|
| if c == nil {
|
| c = client.New()
|
| }
|
| @@ -96,12 +114,14 @@ func New(c client.Client, maxBuilds int) *MasterAnalyzer {
|
| return &MasterAnalyzer{
|
| Client: c,
|
| MaxRecentBuilds: maxBuilds,
|
| + MinRecentBuilds: minBuilds,
|
| HungBuilderThresh: 3 * time.Hour,
|
| OfflineBuilderThresh: 90 * time.Minute,
|
| IdleBuilderCountThresh: 50,
|
| StaleMasterThreshold: 10 * time.Minute,
|
| StepAnalyzers: []StepAnalyzer{
|
| &TestFailureAnalyzer{Client: c},
|
| + &CompileFailureAnalyzer{Client: c},
|
| },
|
|
|
| now: func() time.Time {
|
| @@ -109,6 +129,7 @@ func New(c client.Client, maxBuilds int) *MasterAnalyzer {
|
| },
|
|
|
| bCache: map[string]*messages.Builds{},
|
| + bLock: &sync.Mutex{},
|
| }
|
| }
|
|
|
| @@ -128,7 +149,7 @@ func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m
|
| Key: fmt.Sprintf("stale master: %v", url),
|
| Title: "Stale Master Data",
|
| Body: fmt.Sprintf("%s elapsed since last update (%s).", elapsed, be.CreatedTimestamp.Time()),
|
| - Severity: 0,
|
| + Severity: staleMasterSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: []messages.Link{{"Master", url}},
|
| // No type or extension for now.
|
| @@ -160,7 +181,12 @@ func (a *MasterAnalyzer) BuilderAlerts(url string, be *messages.BuildExtract) []
|
|
|
| // TODO: get a list of all the running builds from be.Slaves? It
|
| // appears to be used later on in the original py.
|
| + scannedBuilders := []string{}
|
| for bn, b := range be.Builders {
|
| + if a.BuilderOnly != "" && bn != a.BuilderOnly {
|
| + continue
|
| + }
|
| + scannedBuilders = append(scannedBuilders, bn)
|
| go func(bn string, b messages.Builders) {
|
| out := r{bn: bn, b: b}
|
| defer func() {
|
| @@ -169,7 +195,6 @@ func (a *MasterAnalyzer) BuilderAlerts(url string, be *messages.BuildExtract) []
|
|
|
| // This blocks on IO, hence the goroutine.
|
| a.warmBuildCache(mn, bn, b.CachedBuilds)
|
| -
|
| // Each call to builderAlerts may trigger blocking json fetches,
|
| // but it has a data dependency on the above cache-warming call, so
|
| // the logic remains serial.
|
| @@ -178,7 +203,7 @@ func (a *MasterAnalyzer) BuilderAlerts(url string, be *messages.BuildExtract) []
|
| }
|
|
|
| ret := []messages.Alert{}
|
| - for bn := range be.Builders {
|
| + for _, bn := range scannedBuilders {
|
| r := <-c
|
| if len(r.err) != 0 {
|
| // TODO: add a special alert for this too?
|
| @@ -198,7 +223,11 @@ func masterName(URL string) (string, error) {
|
| return "", err
|
| }
|
| pathParts := strings.Split(mURL.Path, "/")
|
| - return pathParts[len(pathParts)-1], nil
|
| + if len(pathParts) < 2 {
|
| + return "", fmt.Errorf("Couldn't parse master name from %s", URL)
|
| + }
|
| + // -2 to lop the /json off the http://.../{master.name}/json suffix
|
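| + // e.g. "https://build.chromium.org/p/chromium.win/json" -> "chromium.win"
|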
| + return pathParts[len(pathParts)-2], nil
|
| }
|
|
|
| func cacheKeyForBuild(master, builder string, number int64) string {
|
| @@ -229,14 +258,19 @@ func (a *MasterAnalyzer) warmBuildCache(master, builder string, recentBuildIDs [
|
| // TODO: add FetchBuilds to the client interface. Take a list of {master, builder} and
|
| // return (map[{master, builder}][]Builds, map [{master, builder}]error)
|
| // That way we can do all of these in parallel.
|
| +
|
| status, err := a.Client.JSON(URL, &res)
|
| if err != nil {
|
| log.Errorf("Error (%d) fetching %s: %s", status, URL, err)
|
| }
|
|
|
| + a.bLock.Lock()
|
| for _, b := range res.Builds {
|
| - a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b
|
| + // TODO: consider making res.Builds be []*messages.Builds instead of []messages.Builds
|
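| + // Copy b: taking &b directly would store the address of the loop variable, which is reused on every iteration.
|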
| + ba := b
|
| + a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba
|
| }
|
| + a.bLock.Unlock()
|
| }
|
|
|
| // This type is used for sorting build IDs.
|
| @@ -290,26 +324,64 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), mn, bn)
|
|
|
| // Check for alertable step failures.
|
| - for _, buildID := range recentBuildIDs {
|
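| + // stepFailureAlerts keeps only the newest alert seen for each key, while firstFailureInstance
|
| + // tracks the earliest failing build for that key, so consecutive failing builds collapse into one alert.
|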
| + stepFailureAlerts := map[string]messages.Alert{}
|
| + firstFailureInstance := map[string]int64{}
|
| +
|
| + for i, buildID := range recentBuildIDs {
|
| failures, err := a.stepFailures(mn, bn, buildID)
|
| if err != nil {
|
| errs = append(errs, err)
|
| }
|
| + if len(failures) == 0 && i > a.MinRecentBuilds {
|
| + // Bail as soon as we hit a successful build, once we're past the MinRecentBuilds most recent builds.
|
| + break
|
| + }
|
| +
|
| as, err := a.stepFailureAlerts(failures)
|
| if err != nil {
|
| errs = append(errs, err)
|
| }
|
| - alerts = append(alerts, as...)
|
| +
|
| + lastKey := ""
|
| + // This probably isn't exactly what we want since multiple builders may have the same alert firing.
|
| + // This just merges build ranges on a per-builder basis.
|
| + for _, alr := range as {
|
| + // Only keep the most recent alert. Since recentBuildIDs is
|
| + // sorted newest build first, ignore older instances of the alert.
|
| + if _, ok := stepFailureAlerts[alr.Key]; !ok {
|
| + stepFailureAlerts[alr.Key] = alr
|
| + }
|
| + if alr.Key == lastKey || lastKey == "" {
|
| + if bf, ok := alr.Extension.(messages.BuildFailure); ok {
|
| + firstFailureInstance[alr.Key] = bf.Builders[0].FirstFailure
|
| + } else {
|
| + log.Errorf("Couldn't cast alert extension to BuildFailure: %s", alr.Key)
|
| + }
|
| + }
|
| + lastKey = alr.Key
|
| + }
|
| }
|
|
|
| - // Check for stale builders. Latest build is the first in the list.
|
| + for _, alr := range stepFailureAlerts {
|
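| + // The type assertion copies the BuildFailure value, but Builders is a slice, so this
|
| + // write lands in the shared backing array and is visible through alr.Extension below.
|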
| + alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key]
|
| + alerts = append(alerts, alr)
|
| + }
|
| +
|
| + // Check for hung/offline/idle builders. Latest build is the first in the list.
|
| lastBuildID := recentBuildIDs[0]
|
| log.Infof("Checking last build ID: %d", lastBuildID)
|
| +
|
| - // TODO: get this from cache.
|
| + // Try the warmed cache first; fall back to fetching the build if it's missing.
|
| - lastBuild, err := a.Client.Build(mn, bn, lastBuildID)
|
| - if err != nil {
|
| - errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err))
|
| - return alerts, errs
|
| + a.bLock.Lock()
|
| + lastBuild := a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)]
|
| + a.bLock.Unlock()
|
| + if lastBuild == nil {
|
| + var err error
|
| + lastBuild, err = a.Client.Build(mn, bn, lastBuildID)
|
| + if err != nil {
|
| + errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err))
|
| + return alerts, errs
|
| + }
|
| }
|
|
|
| // Examining only the latest build is probably suboptimal since if it's still in progress it might
|
| @@ -335,7 +407,7 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| Key: fmt.Sprintf("%s.%s.hung", mn, bn),
|
| Title: fmt.Sprintf("%s.%s is hung in step %s.", mn, bn, lastStep),
|
| Body: fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
|
| - Severity: 0,
|
| + Severity: hungBuilderSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: links,
|
| })
|
| @@ -347,8 +419,8 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| alerts = append(alerts, messages.Alert{
|
| Key: fmt.Sprintf("%s.%s.offline", mn, bn),
|
| Title: fmt.Sprintf("%s.%s is offline.", mn, bn),
|
| - Body: fmt.Sprintf("%s.%s has been offline for %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh),
|
| - Severity: 0,
|
| + Body: fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
|
| + Severity: offlineBuilderSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: links,
|
| })
|
| @@ -359,7 +431,7 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| Key: fmt.Sprintf("%s.%s.idle", mn, bn),
|
| Title: fmt.Sprintf("%s.%s is idle with too many pending builds.", mn, bn),
|
| Body: fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.IdleBuilderCountThresh),
|
| - Severity: 0,
|
| + Severity: idleBuilderSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: links,
|
| })
|
| @@ -376,7 +448,9 @@ func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
|
| cc := cacheKeyForBuild(mn, bn, bID)
|
|
|
| var err error // To avoid re-scoping b in the nested conditional below with a :=.
|
| + a.bLock.Lock()
|
| b, ok := a.bCache[cc]
|
| + a.bLock.Unlock()
|
| if !ok {
|
| log.Infof("Cache miss for %s", cc)
|
| b, err = a.Client.Build(mn, bn, bID)
|
| @@ -387,6 +461,7 @@ func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
|
| }
|
|
|
| ret := []stepFailure{}
|
| +
|
| for _, s := range b.Steps {
|
| if !s.IsFinished || len(s.Results) == 0 {
|
| continue
|
| @@ -429,9 +504,15 @@ func (a *MasterAnalyzer) stepFailureAlerts(failures []stepFailure) ([]messages.A
|
| // Might not need full capacity buffer, since some failures are ignored below.
|
| rs := make(chan res, len(failures))
|
|
|
| + scannedFailures := []stepFailure{}
|
| for _, f := range failures {
|
| // goroutine/channel because the reasonsForFailure call potentially
|
| // blocks on IO.
|
| + if f.step.Name == "steps" {
|
| + // The actual breaking step will appear later.
|
| + continue
|
| + }
|
| + scannedFailures = append(scannedFailures, f)
|
| go func(f stepFailure) {
|
| alr := messages.Alert{
|
| Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),
|
| @@ -439,14 +520,17 @@ func (a *MasterAnalyzer) stepFailureAlerts(failures []stepFailure) ([]messages.A
|
| Type: "buildfailure",
|
| }
|
|
|
| + // If the builder has been failing on the same step for multiple builds in a row,
|
| + // we should have only one alert but indicate the range of builds affected.
|
| + // These are set in FirstFailure and LastFailure.
|
| bf := messages.BuildFailure{
|
| // FIXME: group builders?
|
| Builders: []messages.AlertedBuilder{
|
| {
|
| Name: f.builderName,
|
| - URL: f.URL(),
|
| - FirstFailure: 0,
|
| - LatestFailure: 1,
|
| + URL: fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName),
|
| + FirstFailure: f.build.Number,
|
| + LatestFailure: f.build.Number,
|
| },
|
| },
|
| // TODO: RegressionRanges:
|
| @@ -458,29 +542,27 @@ func (a *MasterAnalyzer) stepFailureAlerts(failures []stepFailure) ([]messages.A
|
| bf.Reasons = append(bf.Reasons, messages.Reason{
|
| TestName: r,
|
| Step: f.step.Name,
|
| + URL: f.URL(),
|
| })
|
| }
|
|
|
| alr.Extension = bf
|
| if len(bf.Reasons) == 0 {
|
| - log.Warnf("No reasons for step failure: %s", alertKey(f.masterName, f.builderName, f.step.Name, ""))
|
| - rs <- res{
|
| - f: f,
|
| - }
|
| + alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "")
|
| + log.Warnf("No reasons for step failure: %s", alr.Key)
|
| } else {
|
| // Should the key include all of the reasons?
|
| alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0])
|
| -
|
| - rs <- res{
|
| - f: f,
|
| - a: &alr,
|
| - err: nil,
|
| - }
|
| + }
|
| + rs <- res{
|
| + f: f,
|
| + a: &alr,
|
| + err: nil,
|
| }
|
| }(f)
|
| }
|
|
|
| - for _ = range failures {
|
| + for _ = range scannedFailures {
|
| r := <-rs
|
| if r.a != nil {
|
| ret = append(ret, *r.a)
|
| @@ -499,7 +581,9 @@ func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string {
|
| for _, sfa := range a.StepAnalyzers {
|
| res, err := sfa.Analyze(f)
|
| if err != nil {
|
| + // TODO: return something that contains errors *and* reasons.
|
| log.Errorf("Error get reasons from StepAnalyzer: %v", err)
|
| + continue
|
| }
|
| if res.Recognized {
|
| recognized = true
|
|
|