Chromium Code Reviews| Index: go/src/infra/monitoring/analyzer/analyzer.go |
| diff --git a/go/src/infra/monitoring/analyzer/analyzer.go b/go/src/infra/monitoring/analyzer/analyzer.go |
| index 999f0796fafd7675fc2e288d1a1ad0f190c8aaea..abe054d41596fa6939966d4b8b69ccd9a4c60ba4 100644 |
| --- a/go/src/infra/monitoring/analyzer/analyzer.go |
| +++ b/go/src/infra/monitoring/analyzer/analyzer.go |
| @@ -8,8 +8,10 @@ import ( |
| "errors" |
| "expvar" |
| "fmt" |
| + "regexp" |
| "sort" |
| "strings" |
| + "sync" |
| "time" |
| "github.com/luci/luci-go/common/logging/gologger" |
| @@ -20,22 +22,28 @@ import ( |
| const ( |
| // StepCompletedRun is a synthetic step name used to indicate the build run is complete. |
| - StepCompletedRun = "completed run" |
| - treeCloserPri = 0 |
| - reliableFailureSev = 0 |
| - newFailureSev = 1 |
| - staleMasterSev = 0 |
| - staleBuilderSev = 0 |
| - hungBuilderSev = 1 |
| - idleBuilderSev = 1 |
| - offlineBuilderSev = 1 |
| - resOK = float64(1) |
| - resInfraFailure = float64(4) |
| + StepCompletedRun = "completed run" |
| + |
| + // Order of severity, worst to least bad. |
| + treeCloserSev = iota |
| + staleMasterSev |
| + infraFailureSev |
| + reliableFailureSev |
| + newFailureSev |
| + staleBuilderSev |
| + hungBuilderSev |
| + idleBuilderSev |
| + offlineBuilderSev |
| + |
| + // Step result values. |
| + resOK = float64(1) |
| + resInfraFailure = float64(4) |
| ) |
| var ( |
| log = gologger.Get() |
| expvars = expvar.NewMap("analyzer") |
| + cpRE = regexp.MustCompile("Cr-Commit-Position: (.*)@{#([0-9]+)}") |
| ) |
| var ( |
| @@ -94,14 +102,15 @@ type Analyzer struct { |
| // triggering a "stale master" alert. |
| StaleMasterThreshold time.Duration |
| - // MasterCfgs is a map of master name to MasterConfig |
| - MasterCfgs map[string]messages.MasterConfig |
| + // Gatekeeper is a the parsed gatekeeper.json config file. |
| + Gatekeeper *GatekeeperRules |
| // These limit the scope analysis, useful for debugging. |
| MasterOnly string |
| BuilderOnly string |
| BuildOnly int64 |
| + rslck *sync.Mutex |
|
martiniss
2016/04/12 22:02:54
could you put a comment saying what resources this
|
| revisionSummaries map[string]messages.RevisionSummary |
| // Now is useful for mocking the system clock in testing and simulating time |
| @@ -128,8 +137,8 @@ func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { |
| &TestFailureAnalyzer{Reader: c}, |
| &CompileFailureAnalyzer{Reader: c}, |
| }, |
| - MasterCfgs: map[string]messages.MasterConfig{}, |
| - |
| + Gatekeeper: &GatekeeperRules{}, |
| + rslck: &sync.Mutex{}, |
|
martiniss
2016/04/12 22:02:54
could you put a comment saying what resources this
seanmccullough
2016/04/12 22:13:42
Done.
|
| revisionSummaries: map[string]messages.RevisionSummary{}, |
| Now: func() time.Time { |
| return time.Now() |
| @@ -153,12 +162,13 @@ func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess |
| ret = append(ret, messages.Alert{ |
| Key: fmt.Sprintf("stale master: %v", master), |
| Title: fmt.Sprintf("Stale %s master data", master), |
| - Body: fmt.Sprintf("%s elapsed since last update.", elapsed), |
| + Body: fmt.Sprintf("%dh %2dm elapsed since last update.", int(elapsed.Hours()), int(elapsed.Minutes())), |
| StartTime: messages.TimeToEpochTime(be.CreatedTimestamp.Time()), |
| Severity: staleMasterSev, |
| Time: messages.TimeToEpochTime(a.Now()), |
| Links: []messages.Link{{"Master", client.MasterURL(master)}}, |
| - // No type or extension for now. |
| + Type: "stale-master", |
| + // No extension for now. |
| }) |
| } |
| if elapsed < 0 { |
| @@ -337,6 +347,7 @@ func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messa |
| Severity: hungBuilderSev, |
| Time: messages.TimeToEpochTime(a.Now()), |
| Links: links, |
| + Type: "hung-builder", |
| }) |
| // Note, just because it's building doesn't mean it's in a good state. If the last N builds |
| // all failed (for some large N) then this might still be alertable. |
| @@ -350,6 +361,7 @@ func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messa |
| Severity: offlineBuilderSev, |
| Time: messages.TimeToEpochTime(a.Now()), |
| Links: links, |
| + Type: "offline-builder", |
| }) |
| } |
| case messages.StateIdle: |
| @@ -361,6 +373,7 @@ func (a *Analyzer) builderAlerts(masterName string, builderName string, b *messa |
| Severity: idleBuilderSev, |
| Time: messages.TimeToEpochTime(a.Now()), |
| Links: links, |
| + Type: "idle-builder", |
| }) |
| } |
| default: |
| @@ -494,7 +507,12 @@ func (a *Analyzer) mergeAlertsByStep(alerts []messages.Alert) []messages.Alert { |
| sort.Sort(byRepo(mergedBF.RegressionRanges)) |
| if len(mergedBF.Builders) > 1 { |
| - merged.Title = fmt.Sprintf("%s (failing on %d builders)", step, len(mergedBF.Builders)) |
| + merged.Title = fmt.Sprintf("%s failing on %d builders", step, len(mergedBF.Builders)) |
| + builderNames := []string{} |
| + for _, b := range mergedBF.Builders { |
| + builderNames = append(builderNames, b.Name) |
| + } |
| + merged.Body = strings.Join(builderNames, ", ") |
| } |
| merged.Extension = mergedBF |
| mergedAlerts = append(mergedAlerts, merged) |
| @@ -507,7 +525,9 @@ func (a *Analyzer) mergeAlertsByStep(alerts []messages.Alert) []messages.Alert { |
| func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSummary, error) { |
| ret := []messages.RevisionSummary{} |
| for _, h := range hashes { |
| + a.rslck.Lock() |
| s, ok := a.revisionSummaries[h] |
| + a.rslck.Unlock() |
| if !ok { |
| return nil, fmt.Errorf("Unrecognized hash: %s", h) |
| } |
| @@ -714,6 +734,17 @@ func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, |
| if r, ok := failure.step.Results[0].(float64); ok && r == resInfraFailure { |
| // TODO: Create a trooper alert about this. |
| log.Errorf("INFRA FAILURE: %+v", failure) |
| + alr := messages.Alert{ |
| + Title: fmt.Sprintf("%s infra failure", failure.builderName), |
| + Body: fmt.Sprintf("On step %s", failure.step.Name), |
| + Type: "infra-failure", |
| + Severity: infraFailureSev, |
| + } |
| + rs <- res{ |
| + f: failure, |
| + a: &alr, |
| + err: nil, |
| + } |
| } |
| } |
| continue |
| @@ -721,7 +752,7 @@ func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, |
| } |
| // Check the gatekeeper configs to see if this is ignorable. |
| - if a.excludeFailure(failure.masterName, failure.builderName, failure.step.Name) { |
| + if a.Gatekeeper.ExcludeFailure(failure.masterName, failure.builderName, failure.step.Name) { |
| continue |
| } |
| @@ -741,9 +772,11 @@ func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, |
| expvars.Add("StepFailures", 1) |
| defer expvars.Add("StepFailures", -1) |
| alr := messages.Alert{ |
| - Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName), |
| - Time: messages.EpochTime(a.Now().Unix()), |
| - Type: "buildfailure", |
| + Title: fmt.Sprintf("%s step failure", f.builderName), |
| + Body: fmt.Sprintf("%s failing on %s/%s", f.step.Name, f.masterName, f.builderName), |
| + Time: messages.EpochTime(a.Now().Unix()), |
| + Type: "buildfailure", |
| + Severity: newFailureSev, |
| } |
| regRanges := []messages.RegressionRange{} |
| @@ -787,13 +820,23 @@ func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, |
| // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd. |
| // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time. |
| // A potential problem here is when multiple repos have overlapping gnumbd ranges. |
| + parts := cpRE.FindAllStringSubmatch(change.Comments, -1) |
| + pos, branch := "", "" |
| + if len(parts) > 0 { |
| + branch = parts[0][1] |
| + pos = parts[0][2] |
| + } |
| + a.rslck.Lock() |
| a.revisionSummaries[change.Revision] = messages.RevisionSummary{ |
| GitHash: change.Revision, |
| Link: change.Revlink, |
| Description: trunc(change.Comments), |
| Author: change.Who, |
| When: change.When, |
| + Position: pos, |
| + Branch: branch, |
| } |
| + a.rslck.Unlock() |
| } |
| for repo, revisions := range revisionsByRepo { |
| @@ -817,10 +860,14 @@ func (a *Analyzer) stepFailureAlerts(failures []stepFailure) ([]messages.Alert, |
| LatestFailure: f.build.Number, |
| }, |
| }, |
| - TreeCloser: a.wouldCloseTree(f.masterName, f.builderName, f.step.Name), |
| + TreeCloser: a.Gatekeeper.WouldCloseTree(f.masterName, f.builderName, f.step.Name), |
| RegressionRanges: regRanges, |
| } |
| + if bf.TreeCloser { |
| + alr.Severity = treeCloserSev |
| + } |
| + |
| reasons := a.reasonsForFailure(f) |
| for _, r := range reasons { |
| bf.Reasons = append(bf.Reasons, messages.Reason{ |
| @@ -895,77 +942,6 @@ func (a *Analyzer) reasonsForFailure(f stepFailure) []string { |
| return ret |
| } |
| -func (a *Analyzer) excludeFailure(master, builder, step string) bool { |
| - mc, ok := a.MasterCfgs[master] |
| - if !ok { |
| - log.Errorf("Can't filter unknown master %s", master) |
| - return false |
| - } |
| - |
| - for _, ebName := range mc.ExcludedBuilders { |
| - if ebName == "*" || ebName == builder { |
| - return true |
| - } |
| - } |
| - |
| - // Not clear that builder_alerts even looks at the rest of these condtions |
| - // even though they're specified in gatekeeper.json |
| - for _, s := range mc.ExcludedSteps { |
| - if step == s { |
| - return true |
| - } |
| - } |
| - |
| - bc, ok := mc.Builders[builder] |
| - if !ok { |
| - if bc, ok = mc.Builders["*"]; !ok { |
| - log.Warningf("Unknown %s builder %s", master, builder) |
| - return true |
| - } |
| - } |
| - |
| - for _, esName := range bc.ExcludedSteps { |
| - if esName == step || esName == "*" { |
| - return true |
| - } |
| - } |
| - |
| - return false |
| -} |
| - |
| -func (a *Analyzer) wouldCloseTree(master, builder, step string) bool { |
| - mc, ok := a.MasterCfgs[master] |
| - if !ok { |
| - log.Errorf("Missing master cfg: %s", master) |
| - return false |
| - } |
| - bc, ok := mc.Builders[builder] |
| - if !ok { |
| - bc, ok = mc.Builders["*"] |
| - if ok { |
| - return true |
| - } |
| - } |
| - |
| - for _, xstep := range bc.ExcludedSteps { |
| - if xstep == step { |
| - return false |
| - } |
| - } |
| - |
| - csteps := []string{} |
| - csteps = append(csteps, bc.ClosingSteps...) |
| - csteps = append(csteps, bc.ClosingOptional...) |
| - |
| - for _, cs := range csteps { |
| - if cs == "*" || cs == step { |
| - return true |
| - } |
| - } |
| - |
| - return false |
| -} |
| - |
| // unexpected returns the set of expected xor actual. |
| func unexpected(expected, actual []string) []string { |
| e, a := make(map[string]bool), make(map[string]bool) |