| Index: go/src/infra/monitoring/analyzer/analyzer.go
|
| diff --git a/go/src/infra/monitoring/analyzer/analyzer.go b/go/src/infra/monitoring/analyzer/analyzer.go
|
| index 24754a5431edf82846887fd77e0da2beaa9e5905..ae05c0f67518ae65f121a76799a555a13efa8cf1 100644
|
| --- a/go/src/infra/monitoring/analyzer/analyzer.go
|
| +++ b/go/src/infra/monitoring/analyzer/analyzer.go
|
| @@ -11,6 +11,7 @@ import (
|
| "path/filepath"
|
| "sort"
|
| "strings"
|
| + "sync"
|
| "time"
|
|
|
| "github.com/Sirupsen/logrus"
|
| @@ -21,7 +22,13 @@ import (
|
|
|
| const (
|
| // StepCompletedRun is a synthetic step name used to indicate the build run is complete.
|
| - StepCompletedRun = "completed run"
|
| + StepCompletedRun = "completed run"
|
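| + // Priority and severity constants for generated alerts.
|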
| + treeCloserPri = 0
|
| + staleMasterSev = 0
|
| + staleBuilderSev = 0
|
| + hungBuilderSev = 1
|
| + idleBuilderSev = 1
|
| + offlineBuilderSev = 1
|
| )
|
|
|
| var (
|
| @@ -56,6 +63,9 @@ type MasterAnalyzer struct {
|
| // MaxRecentBuilds is the maximum number of recent builds to check, per builder.
|
| MaxRecentBuilds int
|
|
|
| + // MinRecentBuilds is the minimum number of recent builds to check, per builder.
|
| + MinRecentBuilds int
|
| +
|
| // StepAnalyzers are the set of build step failure analyzers to be checked on
|
| // build step failures.
|
| StepAnalyzers []StepAnalyzer
|
| @@ -81,6 +91,14 @@ type MasterAnalyzer struct {
|
|
|
| // bCache is a map of build cache key to Build message.
|
| bCache map[string]*messages.Builds
|
| + // bLock protects bCache
|
| + bLock *sync.Mutex
|
| +
|
| + // These limit the scope of the analysis; useful for debugging.
|
| + TreeOnly string
|
| + MasterOnly string
|
| + BuilderOnly string
|
| + BuildOnly int64
|
|
|
| // now is useful for mocking the system clock in testing.
|
| now func() time.Time
|
| @@ -88,7 +106,7 @@ type MasterAnalyzer struct {
|
|
|
| // New returns a new Analyzer. If client is nil, it assigns a default implementation.
|
| -// maxBuilds is the maximum number of builds to check, per builder.
|
| +// minBuilds and maxBuilds are the minimum and maximum number of recent builds to check, per builder.
|
| -func New(c client.Client, maxBuilds int) *MasterAnalyzer {
|
| +func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer {
|
| if c == nil {
|
| c = client.New()
|
| }
|
| @@ -96,12 +114,14 @@ func New(c client.Client, maxBuilds int) *MasterAnalyzer {
|
| return &MasterAnalyzer{
|
| Client: c,
|
| MaxRecentBuilds: maxBuilds,
|
| + MinRecentBuilds: minBuilds,
|
| HungBuilderThresh: 3 * time.Hour,
|
| OfflineBuilderThresh: 90 * time.Minute,
|
| IdleBuilderCountThresh: 50,
|
| StaleMasterThreshold: 10 * time.Minute,
|
| StepAnalyzers: []StepAnalyzer{
|
| &TestFailureAnalyzer{Client: c},
|
| + &CompileFailureAnalyzer{Client: c},
|
| },
|
|
|
| now: func() time.Time {
|
| @@ -109,6 +129,7 @@ func New(c client.Client, maxBuilds int) *MasterAnalyzer {
|
| },
|
|
|
| bCache: map[string]*messages.Builds{},
|
| + bLock: &sync.Mutex{},
|
| }
|
| }
|
|
|
| @@ -128,7 +149,7 @@ func (a *MasterAnalyzer) MasterAlerts(url string, be *messages.BuildExtract) []m
|
| Key: fmt.Sprintf("stale master: %v", url),
|
| Title: "Stale Master Data",
|
| Body: fmt.Sprintf("%s elapsed since last update (%s).", elapsed, be.CreatedTimestamp.Time()),
|
| - Severity: 0,
|
| + Severity: staleMasterSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: []messages.Link{{"Master", url}},
|
| // No type or extension for now.
|
| @@ -160,7 +181,12 @@ func (a *MasterAnalyzer) BuilderAlerts(url string, be *messages.BuildExtract) []
|
|
|
| // TODO: get a list of all the running builds from be.Slaves? It
|
| // appears to be used later on in the original py.
|
| + scannedBuilders := []string{}
|
| for bn, b := range be.Builders {
|
| + if a.BuilderOnly != "" && bn != a.BuilderOnly {
|
| + continue
|
| + }
|
| + scannedBuilders = append(scannedBuilders, bn)
|
| go func(bn string, b messages.Builders) {
|
| out := r{bn: bn, b: b}
|
| defer func() {
|
| @@ -169,7 +195,6 @@ func (a *MasterAnalyzer) BuilderAlerts(url string, be *messages.BuildExtract) []
|
|
|
| // This blocks on IO, hence the goroutine.
|
| a.warmBuildCache(mn, bn, b.CachedBuilds)
|
| -
|
| // Each call to builderAlerts may trigger blocking json fetches,
|
| // but it has a data dependency on the above cache-warming call, so
|
| // the logic remains serial.
|
| @@ -178,7 +203,7 @@ func (a *MasterAnalyzer) BuilderAlerts(url string, be *messages.BuildExtract) []
|
| }
|
|
|
| ret := []messages.Alert{}
|
| - for bn := range be.Builders {
|
| + for _, bn := range scannedBuilders {
|
| r := <-c
|
| if len(r.err) != 0 {
|
| // TODO: add a special alert for this too?
|
| @@ -198,7 +223,11 @@ func masterName(URL string) (string, error) {
|
| return "", err
|
| }
|
| pathParts := strings.Split(mURL.Path, "/")
|
| - return pathParts[len(pathParts)-1], nil
|
| + if len(pathParts) < 2 {
|
| + return "", fmt.Errorf("Couldn't parse master name from %s", URL)
|
| + }
|
| + // -2 to lop the /json off the http://.../{master.name}/json suffix
|
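| + // e.g. "https://build.chromium.org/p/chromium.win/json" -> "chromium.win"
|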
| + return pathParts[len(pathParts)-2], nil
|
| }
|
|
|
| func cacheKeyForBuild(master, builder string, number int64) string {
|
| @@ -229,14 +258,19 @@ func (a *MasterAnalyzer) warmBuildCache(master, builder string, recentBuildIDs [
|
| // TODO: add FetchBuilds to the client interface. Take a list of {master, builder} and
|
| // return (map[{master, builder}][]Builds, map [{master, builder}]error)
|
| // That way we can do all of these in parallel.
|
| +
|
| status, err := a.Client.JSON(URL, &res)
|
| if err != nil {
|
| log.Errorf("Error (%d) fetching %s: %s", status, URL, err)
|
| }
|
|
|
| + a.bLock.Lock()
|
| for _, b := range res.Builds {
|
| - a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b
|
| + // TODO: consider making res.Builds be []*messages.Builds instead of []messages.Builds
|
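| + // Copy b: taking &b directly would store the address of the loop variable, which is reused on every iteration.
|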
| + ba := b
|
| + a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba
|
| }
|
| + a.bLock.Unlock()
|
| }
|
|
|
| // This type is used for sorting build IDs.
|
| @@ -290,26 +324,64 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), mn, bn)
|
|
|
| // Check for alertable step failures.
|
| - for _, buildID := range recentBuildIDs {
|
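| + // stepFailureAlerts keeps only the newest alert seen for each key, while firstFailureInstance
|
| + // tracks the earliest failing build for that key, so consecutive failing builds collapse into one alert.
|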
| + stepFailureAlerts := map[string]messages.Alert{}
|
| + firstFailureInstance := map[string]int64{}
|
| +
|
| + for i, buildID := range recentBuildIDs {
|
| failures, err := a.stepFailures(mn, bn, buildID)
|
| if err != nil {
|
| errs = append(errs, err)
|
| }
|
| + if len(failures) == 0 && i > a.MinRecentBuilds {
|
| + // Bail as soon as we hit a successful build, once we're past the MinRecentBuilds most recent builds.
|
| + break
|
| + }
|
| +
|
| as, err := a.stepFailureAlerts(failures)
|
| if err != nil {
|
| errs = append(errs, err)
|
| }
|
| - alerts = append(alerts, as...)
|
| +
|
| + lastKey := ""
|
| + // This probably isn't exactly what we want since multiple builders may have the same alert firing.
|
| + // This just merges build ranges on a per-builder basis.
|
| + for _, alr := range as {
|
| + // Only keep the most recent alert. Since recentBuildIDs is
|
| + // sorted newest build first, ignore older instances of the alert.
|
| + if _, ok := stepFailureAlerts[alr.Key]; !ok {
|
| + stepFailureAlerts[alr.Key] = alr
|
| + }
|
| + if alr.Key == lastKey || lastKey == "" {
|
| + if bf, ok := alr.Extension.(messages.BuildFailure); ok {
|
| + firstFailureInstance[alr.Key] = bf.Builders[0].FirstFailure
|
| + } else {
|
| + log.Errorf("Couldn't cast alert extension to BuildFailure: %s", alr.Key)
|
| + }
|
| + }
|
| + lastKey = alr.Key
|
| + }
|
| }
|
|
|
| - // Check for stale builders. Latest build is the first in the list.
|
| + for _, alr := range stepFailureAlerts {
|
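| + // The type assertion copies the BuildFailure value, but Builders is a slice, so this
|
| + // write lands in the shared backing array and is visible through alr.Extension below.
|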
| + alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key]
|
| + alerts = append(alerts, alr)
|
| + }
|
| +
|
| + // Check for hung/offline/idle builders. Latest build is the first in the list.
|
| lastBuildID := recentBuildIDs[0]
|
| log.Infof("Checking last build ID: %d", lastBuildID)
|
| +
|
| - // TODO: get this from cache.
|
| + // Try the warmed cache first; fall back to fetching the build if it's missing.
|
| - lastBuild, err := a.Client.Build(mn, bn, lastBuildID)
|
| - if err != nil {
|
| - errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err))
|
| - return alerts, errs
|
| + a.bLock.Lock()
|
| + lastBuild := a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)]
|
| + a.bLock.Unlock()
|
| + if lastBuild == nil {
|
| + var err error
|
| + lastBuild, err = a.Client.Build(mn, bn, lastBuildID)
|
| + if err != nil {
|
| + errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err))
|
| + return alerts, errs
|
| + }
|
| }
|
|
|
| // Examining only the latest build is probably suboptimal since if it's still in progress it might
|
| @@ -335,7 +407,7 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| Key: fmt.Sprintf("%s.%s.hung", mn, bn),
|
| Title: fmt.Sprintf("%s.%s is hung in step %s.", mn, bn, lastStep),
|
| Body: fmt.Sprintf("%s.%s has been building for %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
|
| - Severity: 0,
|
| + Severity: hungBuilderSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: links,
|
| })
|
| @@ -347,8 +419,8 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| alerts = append(alerts, messages.Alert{
|
| Key: fmt.Sprintf("%s.%s.offline", mn, bn),
|
| Title: fmt.Sprintf("%s.%s is offline.", mn, bn),
|
| - Body: fmt.Sprintf("%s.%s has been offline for %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh),
|
| - Severity: 0,
|
| + Body: fmt.Sprintf("%s.%s has been offline for %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),
|
| + Severity: offlineBuilderSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: links,
|
| })
|
| @@ -359,7 +431,7 @@ func (a *MasterAnalyzer) builderAlerts(mn string, bn string, b *messages.Builder
|
| Key: fmt.Sprintf("%s.%s.idle", mn, bn),
|
| Title: fmt.Sprintf("%s.%s is idle with too many pending builds.", mn, bn),
|
| Body: fmt.Sprintf("%s.%s is idle with %d pending builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.IdleBuilderCountThresh),
|
| - Severity: 0,
|
| + Severity: idleBuilderSev,
|
| Time: messages.TimeToEpochTime(a.now()),
|
| Links: links,
|
| })
|
| @@ -376,7 +448,9 @@ func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
|
| cc := cacheKeyForBuild(mn, bn, bID)
|
|
|
| var err error // To avoid re-scoping b in the nested conditional below with a :=.
|
| + a.bLock.Lock()
|
| b, ok := a.bCache[cc]
|
| + a.bLock.Unlock()
|
| if !ok {
|
| log.Infof("Cache miss for %s", cc)
|
| b, err = a.Client.Build(mn, bn, bID)
|
| @@ -387,6 +461,7 @@ func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa
|
| }
|
|
|
| ret := []stepFailure{}
|
| +
|
| for _, s := range b.Steps {
|
| if !s.IsFinished || len(s.Results) == 0 {
|
| continue
|
| @@ -429,9 +504,15 @@ func (a *MasterAnalyzer) stepFailureAlerts(failures []stepFailure) ([]messages.A
|
| // Might not need full capacity buffer, since some failures are ignored below.
|
| rs := make(chan res, len(failures))
|
|
|
| + scannedFailures := []stepFailure{}
|
| for _, f := range failures {
|
| // goroutine/channel because the reasonsForFailure call potentially
|
| // blocks on IO.
|
| + if f.step.Name == "steps" {
|
| + // The actual breaking step will appear later.
|
| + continue
|
| + }
|
| + scannedFailures = append(scannedFailures, f)
|
| go func(f stepFailure) {
|
| alr := messages.Alert{
|
| Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),
|
| @@ -439,14 +520,17 @@ func (a *MasterAnalyzer) stepFailureAlerts(failures []stepFailure) ([]messages.A
|
| Type: "buildfailure",
|
| }
|
|
|
| + // If the builder has been failing on the same step for multiple builds in a row,
|
| + // we should have only one alert but indicate the range of builds affected.
|
| + // These are set in FirstFailure and LastFailure.
|
| bf := messages.BuildFailure{
|
| // FIXME: group builders?
|
| Builders: []messages.AlertedBuilder{
|
| {
|
| Name: f.builderName,
|
| - URL: f.URL(),
|
| - FirstFailure: 0,
|
| - LatestFailure: 1,
|
| + URL: fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName),
|
| + FirstFailure: f.build.Number,
|
| + LatestFailure: f.build.Number,
|
| },
|
| },
|
| // TODO: RegressionRanges:
|
| @@ -458,29 +542,27 @@ func (a *MasterAnalyzer) stepFailureAlerts(failures []stepFailure) ([]messages.A
|
| bf.Reasons = append(bf.Reasons, messages.Reason{
|
| TestName: r,
|
| Step: f.step.Name,
|
| + URL: f.URL(),
|
| })
|
| }
|
|
|
| alr.Extension = bf
|
| if len(bf.Reasons) == 0 {
|
| - log.Warnf("No reasons for step failure: %s", alertKey(f.masterName, f.builderName, f.step.Name, ""))
|
| - rs <- res{
|
| - f: f,
|
| - }
|
| + alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "")
|
| + log.Warnf("No reasons for step failure: %s", alr.Key)
|
| } else {
|
| // Should the key include all of the reasons?
|
| alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0])
|
| -
|
| - rs <- res{
|
| - f: f,
|
| - a: &alr,
|
| - err: nil,
|
| - }
|
| + }
|
| + rs <- res{
|
| + f: f,
|
| + a: &alr,
|
| + err: nil,
|
| }
|
| }(f)
|
| }
|
|
|
| - for _ = range failures {
|
| + for _ = range scannedFailures {
|
| r := <-rs
|
| if r.a != nil {
|
| ret = append(ret, *r.a)
|
| @@ -499,7 +581,9 @@ func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string {
|
| for _, sfa := range a.StepAnalyzers {
|
| res, err := sfa.Analyze(f)
|
| if err != nil {
|
| + // TODO: return something that contains errors *and* reasons.
|
| log.Errorf("Error get reasons from StepAnalyzer: %v", err)
|
| + continue
|
| }
|
| if res.Recognized {
|
| recognized = true
|
|
|