Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2308)

Side by Side Diff: go/src/infra/monitoring/analyzer/analyzer.go

Issue 1874053002: [alerts-dispatcher] update to build again :) also fixes a concurrent map access (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: typos Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | go/src/infra/monitoring/analyzer/analyzer_test.go » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package analyzer 5 package analyzer
6 6
7 import ( 7 import (
8 "errors" 8 "errors"
9 "expvar" 9 "expvar"
10 "fmt" 10 "fmt"
11 "regexp"
11 "sort" 12 "sort"
12 "strings" 13 "strings"
14 "sync"
13 "time" 15 "time"
14 16
15 "github.com/luci/luci-go/common/logging/gologger" 17 "github.com/luci/luci-go/common/logging/gologger"
16 18
17 "infra/monitoring/client" 19 "infra/monitoring/client"
18 "infra/monitoring/messages" 20 "infra/monitoring/messages"
19 ) 21 )
20 22
21 const ( 23 const (
22 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete.
23 » StepCompletedRun = "completed run" 25 » StepCompletedRun = "completed run"
24 » treeCloserPri = 0 26
25 » reliableFailureSev = 0 27 » // Order of severity, worst to least bad.
26 » newFailureSev = 1 28 » treeCloserSev = iota
27 » staleMasterSev = 0 29 » staleMasterSev
28 » staleBuilderSev = 0 30 » infraFailureSev
29 » hungBuilderSev = 1 31 » reliableFailureSev
30 » idleBuilderSev = 1 32 » newFailureSev
31 » offlineBuilderSev = 1 33 » staleBuilderSev
32 » resOK = float64(1) 34 » hungBuilderSev
33 » resInfraFailure = float64(4) 35 » idleBuilderSev
36 » offlineBuilderSev
37
38 » // Step result values.
39 » resOK = float64(1)
40 » resInfraFailure = float64(4)
34 ) 41 )
35 42
36 var ( 43 var (
37 log = gologger.Get() 44 log = gologger.Get()
38 expvars = expvar.NewMap("analyzer") 45 expvars = expvar.NewMap("analyzer")
46 cpRE = regexp.MustCompile("Cr-Commit-Position: (.*)@{#([0-9]+)}")
39 ) 47 )
40 48
41 var ( 49 var (
42 errNoBuildSteps = errors.New("No build steps") 50 errNoBuildSteps = errors.New("No build steps")
43 errNoRecentBuilds = errors.New("No recent builds") 51 errNoRecentBuilds = errors.New("No recent builds")
44 errNoCompletedBuilds = errors.New("No completed builds") 52 errNoCompletedBuilds = errors.New("No completed builds")
45 ) 53 )
46 54
47 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for the 55 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for the
48 // failure. It also indicates whether or not it recognizes the stepFailure. 56 // failure. It also indicates whether or not it recognizes the stepFailure.
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
87 OfflineBuilderThresh time.Duration 95 OfflineBuilderThresh time.Duration
88 96
89 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue 97 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue
90 // while in the "idle" state before triggering an "idle builder" alert. 98 // while in the "idle" state before triggering an "idle builder" alert.
91 IdleBuilderCountThresh int64 99 IdleBuilderCountThresh int64
92 100
93 // StaleMasterThreshold is the maximum age that master data from CBE can be before 101 // StaleMasterThreshold is the maximum age that master data from CBE can be before
94 // triggering a "stale master" alert. 102 // triggering a "stale master" alert.
95 StaleMasterThreshold time.Duration 103 StaleMasterThreshold time.Duration
96 104
97 » // MasterCfgs is a map of master name to MasterConfig 105 » // Gatekeeper is the parsed gatekeeper.json config file.
98 » MasterCfgs map[string]messages.MasterConfig 106 » Gatekeeper *GatekeeperRules
99 107
100 // These limit the scope analysis, useful for debugging. 108 // These limit the scope analysis, useful for debugging.
101 MasterOnly string 109 MasterOnly string
102 BuilderOnly string 110 BuilderOnly string
103 BuildOnly int64 111 BuildOnly int64
104 112
113 // rslck protects revisionSummaries from concurrent access.
114 rslck *sync.Mutex
105 revisionSummaries map[string]messages.RevisionSummary 115 revisionSummaries map[string]messages.RevisionSummary
106 116
107 // Now is useful for mocking the system clock in testing and simulating time 117 // Now is useful for mocking the system clock in testing and simulating time
108 // during replay. 118 // during replay.
109 Now func() time.Time 119 Now func() time.Time
110 } 120 }
111 121
112 // New returns a new Analyzer. If client is nil, it assigns a default implementation. 122 // New returns a new Analyzer. If client is nil, it assigns a default implementation.
113 // maxBuilds is the maximum number of builds to check, per builder. 123 // maxBuilds is the maximum number of builds to check, per builder.
114 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { 124 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer {
115 if c == nil { 125 if c == nil {
116 c = client.NewReader(nil) 126 c = client.NewReader(nil)
117 } 127 }
118 128
119 return &Analyzer{ 129 return &Analyzer{
120 Reader: c, 130 Reader: c,
121 MaxRecentBuilds: maxBuilds, 131 MaxRecentBuilds: maxBuilds,
122 MinRecentBuilds: minBuilds, 132 MinRecentBuilds: minBuilds,
123 HungBuilderThresh: 3 * time.Hour, 133 HungBuilderThresh: 3 * time.Hour,
124 OfflineBuilderThresh: 90 * time.Minute, 134 OfflineBuilderThresh: 90 * time.Minute,
125 IdleBuilderCountThresh: 50, 135 IdleBuilderCountThresh: 50,
126 StaleMasterThreshold: 10 * time.Minute, 136 StaleMasterThreshold: 10 * time.Minute,
127 StepAnalyzers: []StepAnalyzer{ 137 StepAnalyzers: []StepAnalyzer{
128 &TestFailureAnalyzer{Reader: c}, 138 &TestFailureAnalyzer{Reader: c},
129 &CompileFailureAnalyzer{Reader: c}, 139 &CompileFailureAnalyzer{Reader: c},
130 }, 140 },
131 » » MasterCfgs: map[string]messages.MasterConfig{}, 141 » » Gatekeeper: &GatekeeperRules{},
132 142 » » rslck: &sync.Mutex{},
133 revisionSummaries: map[string]messages.RevisionSummary{}, 143 revisionSummaries: map[string]messages.RevisionSummary{},
134 Now: func() time.Time { 144 Now: func() time.Time {
135 return time.Now() 145 return time.Now()
136 }, 146 },
137 } 147 }
138 } 148 }
139 149
140 // MasterAlerts returns alerts generated from the master. 150 // MasterAlerts returns alerts generated from the master.
141 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess ages.Alert { 151 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess ages.Alert {
142 ret := []messages.Alert{} 152 ret := []messages.Alert{}
143 153
144 // Copied logic from builder_messages. 154 // Copied logic from builder_messages.
145 // No created_timestamp should be a warning sign, no? 155 // No created_timestamp should be a warning sign, no?
146 if be.CreatedTimestamp == messages.EpochTime(0) { 156 if be.CreatedTimestamp == messages.EpochTime(0) {
147 return ret 157 return ret
148 } 158 }
149 expvars.Add("MasterAlerts", 1) 159 expvars.Add("MasterAlerts", 1)
150 defer expvars.Add("MasterAlerts", -1) 160 defer expvars.Add("MasterAlerts", -1)
151 elapsed := a.Now().Sub(be.CreatedTimestamp.Time()) 161 elapsed := a.Now().Sub(be.CreatedTimestamp.Time())
152 if elapsed > a.StaleMasterThreshold { 162 if elapsed > a.StaleMasterThreshold {
153 ret = append(ret, messages.Alert{ 163 ret = append(ret, messages.Alert{
154 Key: fmt.Sprintf("stale master: %v", master), 164 Key: fmt.Sprintf("stale master: %v", master),
155 Title: fmt.Sprintf("Stale %s master data", master), 165 Title: fmt.Sprintf("Stale %s master data", master),
156 » » » Body: fmt.Sprintf("%s elapsed since last update.", elapsed), 166 » » » Body: fmt.Sprintf("%dh %2dm elapsed since last upda te.", int(elapsed.Hours()), int(elapsed.Minutes())),
157 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp. Time()), 167 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp. Time()),
158 Severity: staleMasterSev, 168 Severity: staleMasterSev,
159 Time: messages.TimeToEpochTime(a.Now()), 169 Time: messages.TimeToEpochTime(a.Now()),
160 Links: []messages.Link{{"Master", client.MasterURL(m aster)}}, 170 Links: []messages.Link{{"Master", client.MasterURL(m aster)}},
161 » » » // No type or extension for now. 171 » » » Type: "stale-master",
172 » » » // No extension for now.
162 }) 173 })
163 } 174 }
164 if elapsed < 0 { 175 if elapsed < 0 {
165 // Add this to the alerts returned, rather than just log it? 176 // Add this to the alerts returned, rather than just log it?
166 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed) 177 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed)
167 } 178 }
168 179
169 return ret 180 return ret
170 } 181 }
171 182
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
330 switch b.State { 341 switch b.State {
331 case messages.StateBuilding: 342 case messages.StateBuilding:
332 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { 343 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
333 alerts = append(alerts, messages.Alert{ 344 alerts = append(alerts, messages.Alert{
334 Key: fmt.Sprintf("%s.%s.hung", masterName, builderName), 345 Key: fmt.Sprintf("%s.%s.hung", masterName, builderName),
335 Title: fmt.Sprintf("%s.%s is hung in step %s. ", masterName, builderName, lastStep), 346 Title: fmt.Sprintf("%s.%s is hung in step %s. ", masterName, builderName, lastStep),
336 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", masterName, bui lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), 347 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", masterName, bui lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
337 Severity: hungBuilderSev, 348 Severity: hungBuilderSev,
338 Time: messages.TimeToEpochTime(a.Now()), 349 Time: messages.TimeToEpochTime(a.Now()),
339 Links: links, 350 Links: links,
351 Type: "hung-builder",
340 }) 352 })
341 // Note, just because it's building doesn't mean it's in a good state. If the last N builds 353 // Note, just because it's building doesn't mean it's in a good state. If the last N builds
342 // all failed (for some large N) then this might still be alertable. 354 // all failed (for some large N) then this might still be alertable.
343 } 355 }
344 case messages.StateOffline: 356 case messages.StateOffline:
345 if elapsed > a.OfflineBuilderThresh { 357 if elapsed > a.OfflineBuilderThresh {
346 alerts = append(alerts, messages.Alert{ 358 alerts = append(alerts, messages.Alert{
347 Key: fmt.Sprintf("%s.%s.offline", masterNam e, builderName), 359 Key: fmt.Sprintf("%s.%s.offline", masterNam e, builderName),
348 Title: fmt.Sprintf("%s.%s is offline.", maste rName, builderName), 360 Title: fmt.Sprintf("%s.%s is offline.", maste rName, builderName),
349 Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", masterName, b uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT hresh), 361 Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", masterName, b uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT hresh),
350 Severity: offlineBuilderSev, 362 Severity: offlineBuilderSev,
351 Time: messages.TimeToEpochTime(a.Now()), 363 Time: messages.TimeToEpochTime(a.Now()),
352 Links: links, 364 Links: links,
365 Type: "offline-builder",
353 }) 366 })
354 } 367 }
355 case messages.StateIdle: 368 case messages.StateIdle:
356 if b.PendingBuilds > a.IdleBuilderCountThresh { 369 if b.PendingBuilds > a.IdleBuilderCountThresh {
357 alerts = append(alerts, messages.Alert{ 370 alerts = append(alerts, messages.Alert{
358 Key: fmt.Sprintf("%s.%s.idle", masterName, builderName), 371 Key: fmt.Sprintf("%s.%s.idle", masterName, builderName),
359 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", masterName, builderName), 372 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", masterName, builderName),
360 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend ingBuilds, a.IdleBuilderCountThresh), 373 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend ingBuilds, a.IdleBuilderCountThresh),
361 Severity: idleBuilderSev, 374 Severity: idleBuilderSev,
362 Time: messages.TimeToEpochTime(a.Now()), 375 Time: messages.TimeToEpochTime(a.Now()),
363 Links: links, 376 Links: links,
377 Type: "idle-builder",
364 }) 378 })
365 } 379 }
366 default: 380 default:
367 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde rName, b.State) 381 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde rName, b.State)
368 } 382 }
369 383
370 // Check for alerts on the most recent complete build 384 // Check for alerts on the most recent complete build
371 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), masterName, builderName) 385 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), masterName, builderName)
372 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl etedBuild.Number}) 386 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl etedBuild.Number})
373 387
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after
487 mergedBF.RegressionRanges = append(mergedBF.RegressionRa nges, messages.RegressionRange{ 501 mergedBF.RegressionRanges = append(mergedBF.RegressionRa nges, messages.RegressionRange{
488 Repo: repo, 502 Repo: repo,
489 Positions: uniques(pos), 503 Positions: uniques(pos),
490 Revisions: uniques(revs), 504 Revisions: uniques(revs),
491 }) 505 })
492 } 506 }
493 507
494 sort.Sort(byRepo(mergedBF.RegressionRanges)) 508 sort.Sort(byRepo(mergedBF.RegressionRanges))
495 509
496 if len(mergedBF.Builders) > 1 { 510 if len(mergedBF.Builders) > 1 {
497 » » » merged.Title = fmt.Sprintf("%s (failing on %d builders)" , step, len(mergedBF.Builders)) 511 » » » merged.Title = fmt.Sprintf("%s failing on %d builders", step, len(mergedBF.Builders))
512 » » » builderNames := []string{}
513 » » » for _, b := range mergedBF.Builders {
514 » » » » builderNames = append(builderNames, b.Name)
515 » » » }
516 » » » merged.Body = strings.Join(builderNames, ", ")
498 } 517 }
499 merged.Extension = mergedBF 518 merged.Extension = mergedBF
500 mergedAlerts = append(mergedAlerts, merged) 519 mergedAlerts = append(mergedAlerts, merged)
501 } 520 }
502 521
503 return mergedAlerts 522 return mergedAlerts
504 } 523 }
505 524
506 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of hashes. 525 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of hashes.
507 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum mary, error) { 526 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum mary, error) {
508 ret := []messages.RevisionSummary{} 527 ret := []messages.RevisionSummary{}
509 for _, h := range hashes { 528 for _, h := range hashes {
529 a.rslck.Lock()
510 s, ok := a.revisionSummaries[h] 530 s, ok := a.revisionSummaries[h]
531 a.rslck.Unlock()
511 if !ok { 532 if !ok {
512 return nil, fmt.Errorf("Unrecognized hash: %s", h) 533 return nil, fmt.Errorf("Unrecognized hash: %s", h)
513 } 534 }
514 ret = append(ret, s) 535 ret = append(ret, s)
515 } 536 }
516 537
517 return ret, nil 538 return ret, nil
518 } 539 }
519 540
520 // builderStepAlerts scans the steps of recent builds done on a particular builder, 541 // builderStepAlerts scans the steps of recent builds done on a particular builder,
(...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after
707 // goroutine/channel because the reasonsForFailure call potentia lly 728 // goroutine/channel because the reasonsForFailure call potentia lly
708 // blocks on IO. 729 // blocks on IO.
709 if failure.step.Name == "steps" { 730 if failure.step.Name == "steps" {
710 // check results to see if it's an array of [4] 731 // check results to see if it's an array of [4]
711 // That's a purple failure, which should go to infra/tro oper. 732 // That's a purple failure, which should go to infra/tro oper.
712 log.Infof("steps results: %+v", failure.step) 733 log.Infof("steps results: %+v", failure.step)
713 if len(failure.step.Results) > 0 { 734 if len(failure.step.Results) > 0 {
714 if r, ok := failure.step.Results[0].(float64); o k && r == resInfraFailure { 735 if r, ok := failure.step.Results[0].(float64); o k && r == resInfraFailure {
715 // TODO: Create a trooper alert about th is. 736 // TODO: Create a trooper alert about th is.
716 log.Errorf("INFRA FAILURE: %+v", failure ) 737 log.Errorf("INFRA FAILURE: %+v", failure )
738 alr := messages.Alert{
739 Title: fmt.Sprintf("%s infra failure", failure.builderName),
740 Body: fmt.Sprintf("On step % s", failure.step.Name),
741 Type: "infra-failure",
742 Severity: infraFailureSev,
743 }
744 rs <- res{
745 f: failure,
746 a: &alr,
747 err: nil,
748 }
717 } 749 }
718 } 750 }
719 continue 751 continue
720 // The actual breaking step will appear later. 752 // The actual breaking step will appear later.
721 } 753 }
722 754
723 // Check the gatekeeper configs to see if this is ignorable. 755 // Check the gatekeeper configs to see if this is ignorable.
724 » » if a.excludeFailure(failure.masterName, failure.builderName, fai lure.step.Name) { 756 » » if a.Gatekeeper.ExcludeFailure(failure.masterName, failure.build erName, failure.step.Name) {
725 continue 757 continue
726 } 758 }
727 759
728 // Gets the named revision number from gnumbd metadata. 760 // Gets the named revision number from gnumbd metadata.
729 getCommitPos := func(b messages.Build, name string) (string, boo l) { 761 getCommitPos := func(b messages.Build, name string) (string, boo l) {
730 for _, p := range b.Properties { 762 for _, p := range b.Properties {
731 if p[0] == name { 763 if p[0] == name {
732 s, ok := p[1].(string) 764 s, ok := p[1].(string)
733 return s, ok 765 return s, ok
734 } 766 }
735 } 767 }
736 return "", false 768 return "", false
737 } 769 }
738 770
739 scannedFailures = append(scannedFailures, failure) 771 scannedFailures = append(scannedFailures, failure)
740 go func(f stepFailure) { 772 go func(f stepFailure) {
741 expvars.Add("StepFailures", 1) 773 expvars.Add("StepFailures", 1)
742 defer expvars.Add("StepFailures", -1) 774 defer expvars.Add("StepFailures", -1)
743 alr := messages.Alert{ 775 alr := messages.Alert{
744 » » » » Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), 776 » » » » Title: fmt.Sprintf("%s step failure", f.build erName),
745 » » » » Time: messages.EpochTime(a.Now().Unix()), 777 » » » » Body: fmt.Sprintf("%s failing on %s/%s", f.s tep.Name, f.masterName, f.builderName),
746 » » » » Type: "buildfailure", 778 » » » » Time: messages.EpochTime(a.Now().Unix()),
779 » » » » Type: "buildfailure",
780 » » » » Severity: newFailureSev,
747 } 781 }
748 782
749 regRanges := []messages.RegressionRange{} 783 regRanges := []messages.RegressionRange{}
750 revisionsByRepo := map[string][]string{} 784 revisionsByRepo := map[string][]string{}
751 785
752 // Get gnumbd sequence numbers for whatever this build p ulled in. 786 // Get gnumbd sequence numbers for whatever this build p ulled in.
753 chromiumPos, ok := getCommitPos(f.build, "got_revision_c p") 787 chromiumPos, ok := getCommitPos(f.build, "got_revision_c p")
754 if ok { 788 if ok {
755 regRanges = append(regRanges, messages.Regressio nRange{ 789 regRanges = append(regRanges, messages.Regressio nRange{
756 Repo: "chromium", 790 Repo: "chromium",
(...skipping 23 matching lines...) Expand all
780 Repo: "nacl", 814 Repo: "nacl",
781 Positions: []string{naclPos}, 815 Positions: []string{naclPos},
782 }) 816 })
783 } 817 }
784 818
785 for _, change := range f.build.SourceStamp.Changes { 819 for _, change := range f.build.SourceStamp.Changes {
786 revisionsByRepo[change.Repository] = append(revi sionsByRepo[change.Repository], change.Revision) 820 revisionsByRepo[change.Repository] = append(revi sionsByRepo[change.Repository], change.Revision)
787 // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd. 821 // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd.
788 // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time. 822 // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time.
789 // A potential problem here is when multiple repos have overlapping gnumbd ranges. 823 // A potential problem here is when multiple repos have overlapping gnumbd ranges.
824 parts := cpRE.FindAllStringSubmatch(change.Comme nts, -1)
825 pos, branch := "", ""
826 if len(parts) > 0 {
827 branch = parts[0][1]
828 pos = parts[0][2]
829 }
830 a.rslck.Lock()
790 a.revisionSummaries[change.Revision] = messages. RevisionSummary{ 831 a.revisionSummaries[change.Revision] = messages. RevisionSummary{
791 GitHash: change.Revision, 832 GitHash: change.Revision,
792 Link: change.Revlink, 833 Link: change.Revlink,
793 Description: trunc(change.Comments), 834 Description: trunc(change.Comments),
794 Author: change.Who, 835 Author: change.Who,
795 When: change.When, 836 When: change.When,
837 Position: pos,
838 Branch: branch,
796 } 839 }
840 a.rslck.Unlock()
797 } 841 }
798 842
799 for repo, revisions := range revisionsByRepo { 843 for repo, revisions := range revisionsByRepo {
800 regRanges = append(regRanges, messages.Regressio nRange{ 844 regRanges = append(regRanges, messages.Regressio nRange{
801 Repo: repo, 845 Repo: repo,
802 Revisions: revisions, 846 Revisions: revisions,
803 }) 847 })
804 } 848 }
805 849
806 // If the builder has been failing on the same step for multiple builds in a row, 850 // If the builder has been failing on the same step for multiple builds in a row,
807 // we should have only one alert but indicate the range of builds affected. 851 // we should have only one alert but indicate the range of builds affected.
808 // These are set in FirstFailure and LastFailure. 852 // These are set in FirstFailure and LastFailure.
809 bf := messages.BuildFailure{ 853 bf := messages.BuildFailure{
810 // FIXME: group builders? 854 // FIXME: group builders?
811 Builders: []messages.AlertedBuilder{ 855 Builders: []messages.AlertedBuilder{
812 { 856 {
813 Name: f.builderName, 857 Name: f.builderName,
814 URL: client.BuilderURL (f.masterName, f.builderName), 858 URL: client.BuilderURL (f.masterName, f.builderName),
815 StartTime: f.build.CreatedTi mestamp, 859 StartTime: f.build.CreatedTi mestamp,
816 FirstFailure: f.build.Number, 860 FirstFailure: f.build.Number,
817 LatestFailure: f.build.Number, 861 LatestFailure: f.build.Number,
818 }, 862 },
819 }, 863 },
820 » » » » TreeCloser: a.wouldCloseTree(f.masterName, f.builderName, f.step.Name), 864 » » » » TreeCloser: a.Gatekeeper.WouldCloseTree(f. masterName, f.builderName, f.step.Name),
821 RegressionRanges: regRanges, 865 RegressionRanges: regRanges,
822 } 866 }
823 867
868 if bf.TreeCloser {
869 alr.Severity = treeCloserSev
870 }
871
824 reasons := a.reasonsForFailure(f) 872 reasons := a.reasonsForFailure(f)
825 for _, r := range reasons { 873 for _, r := range reasons {
826 bf.Reasons = append(bf.Reasons, messages.Reason{ 874 bf.Reasons = append(bf.Reasons, messages.Reason{
827 TestName: r, 875 TestName: r,
828 Step: f.step.Name, 876 Step: f.step.Name,
829 URL: f.URL(), 877 URL: f.URL(),
830 }) 878 })
831 } 879 }
832 880
833 alr.Key = alertKey(f.masterName, f.builderName, f.step.N ame) 881 alr.Key = alertKey(f.masterName, f.builderName, f.step.N ame)
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
888 936
889 if !recognized { 937 if !recognized {
890 // TODO: log and report frequently encountered unrecognized buil der step 938 // TODO: log and report frequently encountered unrecognized buil der step
891 // failure names. 939 // failure names.
892 log.Errorf("Unrecognized step step failure type, unable to find reasons: %s", f.step.Name) 940 log.Errorf("Unrecognized step step failure type, unable to find reasons: %s", f.step.Name)
893 } 941 }
894 942
895 return ret 943 return ret
896 } 944 }
897 945
898 func (a *Analyzer) excludeFailure(master, builder, step string) bool {
899 mc, ok := a.MasterCfgs[master]
900 if !ok {
901 log.Errorf("Can't filter unknown master %s", master)
902 return false
903 }
904
905 for _, ebName := range mc.ExcludedBuilders {
906 if ebName == "*" || ebName == builder {
907 return true
908 }
909 }
910
911 // Not clear that builder_alerts even looks at the rest of these conditions
912 // even though they're specified in gatekeeper.json
913 for _, s := range mc.ExcludedSteps {
914 if step == s {
915 return true
916 }
917 }
918
919 bc, ok := mc.Builders[builder]
920 if !ok {
921 if bc, ok = mc.Builders["*"]; !ok {
922 log.Warningf("Unknown %s builder %s", master, builder)
923 return true
924 }
925 }
926
927 for _, esName := range bc.ExcludedSteps {
928 if esName == step || esName == "*" {
929 return true
930 }
931 }
932
933 return false
934 }
935
936 func (a *Analyzer) wouldCloseTree(master, builder, step string) bool {
937 mc, ok := a.MasterCfgs[master]
938 if !ok {
939 log.Errorf("Missing master cfg: %s", master)
940 return false
941 }
942 bc, ok := mc.Builders[builder]
943 if !ok {
944 bc, ok = mc.Builders["*"]
945 if ok {
946 return true
947 }
948 }
949
950 for _, xstep := range bc.ExcludedSteps {
951 if xstep == step {
952 return false
953 }
954 }
955
956 csteps := []string{}
957 csteps = append(csteps, bc.ClosingSteps...)
958 csteps = append(csteps, bc.ClosingOptional...)
959
960 for _, cs := range csteps {
961 if cs == "*" || cs == step {
962 return true
963 }
964 }
965
966 return false
967 }
968
969 // unexpected returns the set of expected xor actual. 946 // unexpected returns the set of expected xor actual.
970 func unexpected(expected, actual []string) []string { 947 func unexpected(expected, actual []string) []string {
971 e, a := make(map[string]bool), make(map[string]bool) 948 e, a := make(map[string]bool), make(map[string]bool)
972 for _, s := range expected { 949 for _, s := range expected {
973 e[s] = true 950 e[s] = true
974 } 951 }
975 for _, s := range actual { 952 for _, s := range actual {
976 a[s] = true 953 a[s] = true
977 } 954 }
978 955
(...skipping 17 matching lines...) Expand all
996 masterName string 973 masterName string
997 builderName string 974 builderName string
998 build messages.Build 975 build messages.Build
999 step messages.Step 976 step messages.Step
1000 } 977 }
1001 978
1002 // URL returns a url to builder step failure page. 979 // URL returns a url to builder step failure page.
1003 func (f stepFailure) URL() string { 980 func (f stepFailure) URL() string {
1004 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build. Number) 981 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build. Number)
1005 } 982 }
OLDNEW
« no previous file with comments | « no previous file | go/src/infra/monitoring/analyzer/analyzer_test.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698