Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(437)

Side by Side Diff: go/src/infra/monitoring/analyzer/analyzer.go

Issue 1874053002: [alerts-dispatcher] update to build again :) also fixes a concurrent map access (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master
Patch Set: merge to master, fix tests Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package analyzer 5 package analyzer
6 6
7 import ( 7 import (
8 "errors" 8 "errors"
9 "expvar" 9 "expvar"
10 "fmt" 10 "fmt"
11 "regexp"
11 "sort" 12 "sort"
12 "strings" 13 "strings"
14 "sync"
13 "time" 15 "time"
14 16
15 "github.com/luci/luci-go/common/logging/gologger" 17 "github.com/luci/luci-go/common/logging/gologger"
16 18
17 "infra/monitoring/client" 19 "infra/monitoring/client"
18 "infra/monitoring/messages" 20 "infra/monitoring/messages"
19 ) 21 )
20 22
21 const ( 23 const (
22 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete.
23 » StepCompletedRun = "completed run" 25 » StepCompletedRun = "completed run"
24 » treeCloserPri = 0 26
25 » reliableFailureSev = 0 27 » // Order of severity, worst to least bad.
26 » newFailureSev = 1 28 » treeCloserSev = iota
27 » staleMasterSev = 0 29 » staleMasterSev
28 » staleBuilderSev = 0 30 » infraFailureSev
29 » hungBuilderSev = 1 31 » reliableFailureSev
30 » idleBuilderSev = 1 32 » newFailureSev
31 » offlineBuilderSev = 1 33 » staleBuilderSev
32 » resOK = float64(1) 34 » hungBuilderSev
33 » resInfraFailure = float64(4) 35 » idleBuilderSev
36 » offlineBuilderSev
37
38 » // Step result values.
39 » resOK = float64(1)
40 » resInfraFailure = float64(4)
34 ) 41 )
35 42
36 var ( 43 var (
37 log = gologger.Get() 44 log = gologger.Get()
38 expvars = expvar.NewMap("analyzer") 45 expvars = expvar.NewMap("analyzer")
46 cpRE = regexp.MustCompile("Cr-Commit-Position: (.*)@{#([0-9]+)}")
39 ) 47 )
40 48
41 var ( 49 var (
42 errNoBuildSteps = errors.New("No build steps") 50 errNoBuildSteps = errors.New("No build steps")
43 errNoRecentBuilds = errors.New("No recent builds") 51 errNoRecentBuilds = errors.New("No recent builds")
44 errNoCompletedBuilds = errors.New("No completed builds") 52 errNoCompletedBuilds = errors.New("No completed builds")
45 ) 53 )
46 54
47 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for the 55 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for the
48 // failure. It also indicates whether or not it recognizes the stepFailure. 56 // failure. It also indicates whether or not it recognizes the stepFailure.
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
87 OfflineBuilderThresh time.Duration 95 OfflineBuilderThresh time.Duration
88 96
89 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue 97 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue
90 // while in the "idle" state before triggering an "idle builder" alert. 98 // while in the "idle" state before triggering an "idle builder" alert.
91 IdleBuilderCountThresh int64 99 IdleBuilderCountThresh int64
92 100
93 // StaleMasterThreshold is the maximum age that master data from CBE can be before 101 // StaleMasterThreshold is the maximum age that master data from CBE can be before
94 // triggering a "stale master" alert. 102 // triggering a "stale master" alert.
95 StaleMasterThreshold time.Duration 103 StaleMasterThreshold time.Duration
96 104
97 » // MasterCfgs is a map of master name to MasterConfig 105 » // Gatekeeper is a the parsed gatekeeper.json config file.
98 » MasterCfgs map[string]messages.MasterConfig 106 » Gatekeeper *GatekeeperRules
99 107
100 // These limit the scope analysis, useful for debugging. 108 // These limit the scope analysis, useful for debugging.
101 MasterOnly string 109 MasterOnly string
102 BuilderOnly string 110 BuilderOnly string
103 BuildOnly int64 111 BuildOnly int64
104 112
113 rslck *sync.Mutex
martiniss 2016/04/12 22:02:54 could you put a comment saying what resources this
105 revisionSummaries map[string]messages.RevisionSummary 114 revisionSummaries map[string]messages.RevisionSummary
106 115
107 // Now is useful for mocking the system clock in testing and simulating time 116 // Now is useful for mocking the system clock in testing and simulating time
108 // during replay. 117 // during replay.
109 Now func() time.Time 118 Now func() time.Time
110 } 119 }
111 120
112 // New returns a new Analyzer. If client is nil, it assigns a default implementation. 121 // New returns a new Analyzer. If client is nil, it assigns a default implementation.
113 // maxBuilds is the maximum number of builds to check, per builder. 122 // maxBuilds is the maximum number of builds to check, per builder.
114 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { 123 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer {
115 if c == nil { 124 if c == nil {
116 c = client.NewReader(nil) 125 c = client.NewReader(nil)
117 } 126 }
118 127
119 return &Analyzer{ 128 return &Analyzer{
120 Reader: c, 129 Reader: c,
121 MaxRecentBuilds: maxBuilds, 130 MaxRecentBuilds: maxBuilds,
122 MinRecentBuilds: minBuilds, 131 MinRecentBuilds: minBuilds,
123 HungBuilderThresh: 3 * time.Hour, 132 HungBuilderThresh: 3 * time.Hour,
124 OfflineBuilderThresh: 90 * time.Minute, 133 OfflineBuilderThresh: 90 * time.Minute,
125 IdleBuilderCountThresh: 50, 134 IdleBuilderCountThresh: 50,
126 StaleMasterThreshold: 10 * time.Minute, 135 StaleMasterThreshold: 10 * time.Minute,
127 StepAnalyzers: []StepAnalyzer{ 136 StepAnalyzers: []StepAnalyzer{
128 &TestFailureAnalyzer{Reader: c}, 137 &TestFailureAnalyzer{Reader: c},
129 &CompileFailureAnalyzer{Reader: c}, 138 &CompileFailureAnalyzer{Reader: c},
130 }, 139 },
131 » » MasterCfgs: map[string]messages.MasterConfig{}, 140 » » Gatekeeper: &GatekeeperRules{},
132 141 » » rslck: &sync.Mutex{},
martiniss 2016/04/12 22:02:54 could you put a comment saying what resources this
seanmccullough 2016/04/12 22:13:42 Done.
133 revisionSummaries: map[string]messages.RevisionSummary{}, 142 revisionSummaries: map[string]messages.RevisionSummary{},
134 Now: func() time.Time { 143 Now: func() time.Time {
135 return time.Now() 144 return time.Now()
136 }, 145 },
137 } 146 }
138 } 147 }
139 148
140 // MasterAlerts returns alerts generated from the master. 149 // MasterAlerts returns alerts generated from the master.
141 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess ages.Alert { 150 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess ages.Alert {
142 ret := []messages.Alert{} 151 ret := []messages.Alert{}
143 152
144 // Copied logic from builder_messages. 153 // Copied logic from builder_messages.
145 // No created_timestamp should be a warning sign, no? 154 // No created_timestamp should be a warning sign, no?
146 if be.CreatedTimestamp == messages.EpochTime(0) { 155 if be.CreatedTimestamp == messages.EpochTime(0) {
147 return ret 156 return ret
148 } 157 }
149 expvars.Add("MasterAlerts", 1) 158 expvars.Add("MasterAlerts", 1)
150 defer expvars.Add("MasterAlerts", -1) 159 defer expvars.Add("MasterAlerts", -1)
151 elapsed := a.Now().Sub(be.CreatedTimestamp.Time()) 160 elapsed := a.Now().Sub(be.CreatedTimestamp.Time())
152 if elapsed > a.StaleMasterThreshold { 161 if elapsed > a.StaleMasterThreshold {
153 ret = append(ret, messages.Alert{ 162 ret = append(ret, messages.Alert{
154 Key: fmt.Sprintf("stale master: %v", master), 163 Key: fmt.Sprintf("stale master: %v", master),
155 Title: fmt.Sprintf("Stale %s master data", master), 164 Title: fmt.Sprintf("Stale %s master data", master),
156 » » » Body: fmt.Sprintf("%s elapsed since last update.", elapsed), 165 » » » Body: fmt.Sprintf("%dh %2dm elapsed since last update.", int(elapsed.Hours()), int(elapsed.Minutes())),
157 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp. Time()), 166 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp. Time()),
158 Severity: staleMasterSev, 167 Severity: staleMasterSev,
159 Time: messages.TimeToEpochTime(a.Now()), 168 Time: messages.TimeToEpochTime(a.Now()),
160 Links: []messages.Link{{"Master", client.MasterURL(m aster)}}, 169 Links: []messages.Link{{"Master", client.MasterURL(m aster)}},
161 » » » // No type or extension for now. 170 » » » Type: "stale-master",
171 » » » // No extension for now.
162 }) 172 })
163 } 173 }
164 if elapsed < 0 { 174 if elapsed < 0 {
165 // Add this to the alerts returned, rather than just log it? 175 // Add this to the alerts returned, rather than just log it?
166 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed) 176 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed)
167 } 177 }
168 178
169 return ret 179 return ret
170 } 180 }
171 181
(...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after
330 switch b.State { 340 switch b.State {
331 case messages.StateBuilding: 341 case messages.StateBuilding:
332 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { 342 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {
333 alerts = append(alerts, messages.Alert{ 343 alerts = append(alerts, messages.Alert{
334 Key: fmt.Sprintf("%s.%s.hung", masterName, builderName), 344 Key: fmt.Sprintf("%s.%s.hung", masterName, builderName),
335 Title: fmt.Sprintf("%s.%s is hung in step %s. ", masterName, builderName, lastStep), 345 Title: fmt.Sprintf("%s.%s is hung in step %s. ", masterName, builderName, lastStep),
336 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", masterName, bui lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), 346 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", masterName, bui lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh),
337 Severity: hungBuilderSev, 347 Severity: hungBuilderSev,
338 Time: messages.TimeToEpochTime(a.Now()), 348 Time: messages.TimeToEpochTime(a.Now()),
339 Links: links, 349 Links: links,
350 Type: "hung-builder",
340 }) 351 })
341 // Note, just because it's building doesn't mean it's in a good state. If the last N builds 352 // Note, just because it's building doesn't mean it's in a good state. If the last N builds
342 // all failed (for some large N) then this might still be alertable. 353 // all failed (for some large N) then this might still be alertable.
343 } 354 }
344 case messages.StateOffline: 355 case messages.StateOffline:
345 if elapsed > a.OfflineBuilderThresh { 356 if elapsed > a.OfflineBuilderThresh {
346 alerts = append(alerts, messages.Alert{ 357 alerts = append(alerts, messages.Alert{
347 Key: fmt.Sprintf("%s.%s.offline", masterNam e, builderName), 358 Key: fmt.Sprintf("%s.%s.offline", masterNam e, builderName),
348 Title: fmt.Sprintf("%s.%s is offline.", maste rName, builderName), 359 Title: fmt.Sprintf("%s.%s is offline.", maste rName, builderName),
349 Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", masterName, b uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT hresh), 360 Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", masterName, b uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT hresh),
350 Severity: offlineBuilderSev, 361 Severity: offlineBuilderSev,
351 Time: messages.TimeToEpochTime(a.Now()), 362 Time: messages.TimeToEpochTime(a.Now()),
352 Links: links, 363 Links: links,
364 Type: "offline-builder",
353 }) 365 })
354 } 366 }
355 case messages.StateIdle: 367 case messages.StateIdle:
356 if b.PendingBuilds > a.IdleBuilderCountThresh { 368 if b.PendingBuilds > a.IdleBuilderCountThresh {
357 alerts = append(alerts, messages.Alert{ 369 alerts = append(alerts, messages.Alert{
358 Key: fmt.Sprintf("%s.%s.idle", masterName, builderName), 370 Key: fmt.Sprintf("%s.%s.idle", masterName, builderName),
359 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", masterName, builderName), 371 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", masterName, builderName),
360 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend ingBuilds, a.IdleBuilderCountThresh), 372 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend ingBuilds, a.IdleBuilderCountThresh),
361 Severity: idleBuilderSev, 373 Severity: idleBuilderSev,
362 Time: messages.TimeToEpochTime(a.Now()), 374 Time: messages.TimeToEpochTime(a.Now()),
363 Links: links, 375 Links: links,
376 Type: "idle-builder",
364 }) 377 })
365 } 378 }
366 default: 379 default:
367 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde rName, b.State) 380 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde rName, b.State)
368 } 381 }
369 382
370 // Check for alerts on the most recent complete build 383 // Check for alerts on the most recent complete build
371 log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName) 384 log.Infof("Checking %d most recent builds for alertable step failures: %s/%s", len(recentBuildIDs), masterName, builderName)
372 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl etedBuild.Number}) 385 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl etedBuild.Number})
373 386
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after
487 mergedBF.RegressionRanges = append(mergedBF.RegressionRa nges, messages.RegressionRange{ 500 mergedBF.RegressionRanges = append(mergedBF.RegressionRa nges, messages.RegressionRange{
488 Repo: repo, 501 Repo: repo,
489 Positions: uniques(pos), 502 Positions: uniques(pos),
490 Revisions: uniques(revs), 503 Revisions: uniques(revs),
491 }) 504 })
492 } 505 }
493 506
494 sort.Sort(byRepo(mergedBF.RegressionRanges)) 507 sort.Sort(byRepo(mergedBF.RegressionRanges))
495 508
496 if len(mergedBF.Builders) > 1 { 509 if len(mergedBF.Builders) > 1 {
497 » » » merged.Title = fmt.Sprintf("%s (failing on %d builders)" , step, len(mergedBF.Builders)) 510 » » » merged.Title = fmt.Sprintf("%s failing on %d builders", step, len(mergedBF.Builders))
511 » » » builderNames := []string{}
512 » » » for _, b := range mergedBF.Builders {
513 » » » » builderNames = append(builderNames, b.Name)
514 » » » }
515 » » » merged.Body = strings.Join(builderNames, ", ")
498 } 516 }
499 merged.Extension = mergedBF 517 merged.Extension = mergedBF
500 mergedAlerts = append(mergedAlerts, merged) 518 mergedAlerts = append(mergedAlerts, merged)
501 } 519 }
502 520
503 return mergedAlerts 521 return mergedAlerts
504 } 522 }
505 523
506 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of has hes. 524 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of has hes.
507 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum mary, error) { 525 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum mary, error) {
508 ret := []messages.RevisionSummary{} 526 ret := []messages.RevisionSummary{}
509 for _, h := range hashes { 527 for _, h := range hashes {
528 a.rslck.Lock()
510 s, ok := a.revisionSummaries[h] 529 s, ok := a.revisionSummaries[h]
530 a.rslck.Unlock()
511 if !ok { 531 if !ok {
512 return nil, fmt.Errorf("Unrecognized hash: %s", h) 532 return nil, fmt.Errorf("Unrecognized hash: %s", h)
513 } 533 }
514 ret = append(ret, s) 534 ret = append(ret, s)
515 } 535 }
516 536
517 return ret, nil 537 return ret, nil
518 } 538 }
519 539
520 // builderStepAlerts scans the steps of recent builds done on a particular builder, 540 // builderStepAlerts scans the steps of recent builds done on a particular builder,
(...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after
707 // goroutine/channel because the reasonsForFailure call potentially 727 // goroutine/channel because the reasonsForFailure call potentially
708 // blocks on IO. 728 // blocks on IO.
709 if failure.step.Name == "steps" { 729 if failure.step.Name == "steps" {
710 // check results to see if it's an array of [4] 730 // check results to see if it's an array of [4]
711 // That's a purple failure, which should go to infra/trooper. 731 // That's a purple failure, which should go to infra/trooper.
712 log.Infof("steps results: %+v", failure.step) 732 log.Infof("steps results: %+v", failure.step)
713 if len(failure.step.Results) > 0 { 733 if len(failure.step.Results) > 0 {
714 if r, ok := failure.step.Results[0].(float64); o k && r == resInfraFailure { 734 if r, ok := failure.step.Results[0].(float64); o k && r == resInfraFailure {
715 // TODO: Create a trooper alert about th is. 735 // TODO: Create a trooper alert about th is.
716 log.Errorf("INFRA FAILURE: %+v", failure ) 736 log.Errorf("INFRA FAILURE: %+v", failure )
737 alr := messages.Alert{
738 Title: fmt.Sprintf("%s infra failure", failure.builderName),
739 Body: fmt.Sprintf("On step % s", failure.step.Name),
740 Type: "infra-failure",
741 Severity: infraFailureSev,
742 }
743 rs <- res{
744 f: failure,
745 a: &alr,
746 err: nil,
747 }
717 } 748 }
718 } 749 }
719 continue 750 continue
720 // The actual breaking step will appear later. 751 // The actual breaking step will appear later.
721 } 752 }
722 753
723 // Check the gatekeeper configs to see if this is ignorable. 754 // Check the gatekeeper configs to see if this is ignorable.
724 » » if a.excludeFailure(failure.masterName, failure.builderName, fai lure.step.Name) { 755 » » if a.Gatekeeper.ExcludeFailure(failure.masterName, failure.build erName, failure.step.Name) {
725 continue 756 continue
726 } 757 }
727 758
728 // Gets the named revision number from gnumbd metadata. 759 // Gets the named revision number from gnumbd metadata.
729 getCommitPos := func(b messages.Build, name string) (string, boo l) { 760 getCommitPos := func(b messages.Build, name string) (string, boo l) {
730 for _, p := range b.Properties { 761 for _, p := range b.Properties {
731 if p[0] == name { 762 if p[0] == name {
732 s, ok := p[1].(string) 763 s, ok := p[1].(string)
733 return s, ok 764 return s, ok
734 } 765 }
735 } 766 }
736 return "", false 767 return "", false
737 } 768 }
738 769
739 scannedFailures = append(scannedFailures, failure) 770 scannedFailures = append(scannedFailures, failure)
740 go func(f stepFailure) { 771 go func(f stepFailure) {
741 expvars.Add("StepFailures", 1) 772 expvars.Add("StepFailures", 1)
742 defer expvars.Add("StepFailures", -1) 773 defer expvars.Add("StepFailures", -1)
743 alr := messages.Alert{ 774 alr := messages.Alert{
744 » » » » Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), 775 » » » » Title: fmt.Sprintf("%s step failure", f.build erName),
745 » » » » Time: messages.EpochTime(a.Now().Unix()), 776 » » » » Body: fmt.Sprintf("%s failing on %s/%s", f.s tep.Name, f.masterName, f.builderName),
746 » » » » Type: "buildfailure", 777 » » » » Time: messages.EpochTime(a.Now().Unix()),
778 » » » » Type: "buildfailure",
779 » » » » Severity: newFailureSev,
747 } 780 }
748 781
749 regRanges := []messages.RegressionRange{} 782 regRanges := []messages.RegressionRange{}
750 revisionsByRepo := map[string][]string{} 783 revisionsByRepo := map[string][]string{}
751 784
752 // Get gnumbd sequence numbers for whatever this build pulled in. 785 // Get gnumbd sequence numbers for whatever this build pulled in.
753 chromiumPos, ok := getCommitPos(f.build, "got_revision_c p") 786 chromiumPos, ok := getCommitPos(f.build, "got_revision_c p")
754 if ok { 787 if ok {
755 regRanges = append(regRanges, messages.Regressio nRange{ 788 regRanges = append(regRanges, messages.Regressio nRange{
756 Repo: "chromium", 789 Repo: "chromium",
(...skipping 23 matching lines...) Expand all
780 Repo: "nacl", 813 Repo: "nacl",
781 Positions: []string{naclPos}, 814 Positions: []string{naclPos},
782 }) 815 })
783 } 816 }
784 817
785 for _, change := range f.build.SourceStamp.Changes { 818 for _, change := range f.build.SourceStamp.Changes {
786 revisionsByRepo[change.Repository] = append(revi sionsByRepo[change.Repository], change.Revision) 819 revisionsByRepo[change.Repository] = append(revi sionsByRepo[change.Repository], change.Revision)
787 // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd. 820 // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd.
788 // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time. 821 // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time.
789 // A potential problem here is when multiple repos have overlapping gnumbd ranges. 822 // A potential problem here is when multiple repos have overlapping gnumbd ranges.
823 parts := cpRE.FindAllStringSubmatch(change.Comme nts, -1)
824 pos, branch := "", ""
825 if len(parts) > 0 {
826 branch = parts[0][1]
827 pos = parts[0][2]
828 }
829 a.rslck.Lock()
790 a.revisionSummaries[change.Revision] = messages. RevisionSummary{ 830 a.revisionSummaries[change.Revision] = messages. RevisionSummary{
791 GitHash: change.Revision, 831 GitHash: change.Revision,
792 Link: change.Revlink, 832 Link: change.Revlink,
793 Description: trunc(change.Comments), 833 Description: trunc(change.Comments),
794 Author: change.Who, 834 Author: change.Who,
795 When: change.When, 835 When: change.When,
836 Position: pos,
837 Branch: branch,
796 } 838 }
839 a.rslck.Unlock()
797 } 840 }
798 841
799 for repo, revisions := range revisionsByRepo { 842 for repo, revisions := range revisionsByRepo {
800 regRanges = append(regRanges, messages.Regressio nRange{ 843 regRanges = append(regRanges, messages.Regressio nRange{
801 Repo: repo, 844 Repo: repo,
802 Revisions: revisions, 845 Revisions: revisions,
803 }) 846 })
804 } 847 }
805 848
806 // If the builder has been failing on the same step for multiple builds in a row, 849 // If the builder has been failing on the same step for multiple builds in a row,
807 // we should have only one alert but indicate the range of builds affected. 850 // we should have only one alert but indicate the range of builds affected.
808 // These are set in FirstFailure and LastFailure. 851 // These are set in FirstFailure and LastFailure.
809 bf := messages.BuildFailure{ 852 bf := messages.BuildFailure{
810 // FIXME: group builders? 853 // FIXME: group builders?
811 Builders: []messages.AlertedBuilder{ 854 Builders: []messages.AlertedBuilder{
812 { 855 {
813 Name: f.builderName, 856 Name: f.builderName,
814 URL: client.BuilderURL (f.masterName, f.builderName), 857 URL: client.BuilderURL (f.masterName, f.builderName),
815 StartTime: f.build.CreatedTi mestamp, 858 StartTime: f.build.CreatedTi mestamp,
816 FirstFailure: f.build.Number, 859 FirstFailure: f.build.Number,
817 LatestFailure: f.build.Number, 860 LatestFailure: f.build.Number,
818 }, 861 },
819 }, 862 },
820 » » » » TreeCloser: a.wouldCloseTree(f.masterName, f.builderName, f.step.Name), 863 » » » » TreeCloser: a.Gatekeeper.WouldCloseTree(f. masterName, f.builderName, f.step.Name),
821 RegressionRanges: regRanges, 864 RegressionRanges: regRanges,
822 } 865 }
823 866
867 if bf.TreeCloser {
868 alr.Severity = treeCloserSev
869 }
870
824 reasons := a.reasonsForFailure(f) 871 reasons := a.reasonsForFailure(f)
825 for _, r := range reasons { 872 for _, r := range reasons {
826 bf.Reasons = append(bf.Reasons, messages.Reason{ 873 bf.Reasons = append(bf.Reasons, messages.Reason{
827 TestName: r, 874 TestName: r,
828 Step: f.step.Name, 875 Step: f.step.Name,
829 URL: f.URL(), 876 URL: f.URL(),
830 }) 877 })
831 } 878 }
832 879
833 alr.Key = alertKey(f.masterName, f.builderName, f.step.N ame) 880 alr.Key = alertKey(f.masterName, f.builderName, f.step.N ame)
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
888 935
889 if !recognized { 936 if !recognized {
890 // TODO: log and report frequently encountered unrecognized builder step 937 // TODO: log and report frequently encountered unrecognized builder step
891 // failure names. 938 // failure names.
892 log.Errorf("Unrecognized step step failure type, unable to find reasons: %s", f.step.Name) 939 log.Errorf("Unrecognized step step failure type, unable to find reasons: %s", f.step.Name)
893 } 940 }
894 941
895 return ret 942 return ret
896 } 943 }
897 944
898 func (a *Analyzer) excludeFailure(master, builder, step string) bool {
899 mc, ok := a.MasterCfgs[master]
900 if !ok {
901 log.Errorf("Can't filter unknown master %s", master)
902 return false
903 }
904
905 for _, ebName := range mc.ExcludedBuilders {
906 if ebName == "*" || ebName == builder {
907 return true
908 }
909 }
910
911 // Not clear that builder_alerts even looks at the rest of these conditions
912 // even though they're specified in gatekeeper.json
913 for _, s := range mc.ExcludedSteps {
914 if step == s {
915 return true
916 }
917 }
918
919 bc, ok := mc.Builders[builder]
920 if !ok {
921 if bc, ok = mc.Builders["*"]; !ok {
922 log.Warningf("Unknown %s builder %s", master, builder)
923 return true
924 }
925 }
926
927 for _, esName := range bc.ExcludedSteps {
928 if esName == step || esName == "*" {
929 return true
930 }
931 }
932
933 return false
934 }
935
936 func (a *Analyzer) wouldCloseTree(master, builder, step string) bool {
937 mc, ok := a.MasterCfgs[master]
938 if !ok {
939 log.Errorf("Missing master cfg: %s", master)
940 return false
941 }
942 bc, ok := mc.Builders[builder]
943 if !ok {
944 bc, ok = mc.Builders["*"]
945 if ok {
946 return true
947 }
948 }
949
950 for _, xstep := range bc.ExcludedSteps {
951 if xstep == step {
952 return false
953 }
954 }
955
956 csteps := []string{}
957 csteps = append(csteps, bc.ClosingSteps...)
958 csteps = append(csteps, bc.ClosingOptional...)
959
960 for _, cs := range csteps {
961 if cs == "*" || cs == step {
962 return true
963 }
964 }
965
966 return false
967 }
968
969 // unexpected returns the set of expected xor actual. 945 // unexpected returns the set of expected xor actual.
970 func unexpected(expected, actual []string) []string { 946 func unexpected(expected, actual []string) []string {
971 e, a := make(map[string]bool), make(map[string]bool) 947 e, a := make(map[string]bool), make(map[string]bool)
972 for _, s := range expected { 948 for _, s := range expected {
973 e[s] = true 949 e[s] = true
974 } 950 }
975 for _, s := range actual { 951 for _, s := range actual {
976 a[s] = true 952 a[s] = true
977 } 953 }
978 954
(...skipping 17 matching lines...) Expand all
996 masterName string 972 masterName string
997 builderName string 973 builderName string
998 build messages.Build 974 build messages.Build
999 step messages.Step 975 step messages.Step
1000 } 976 }
1001 977
1002 // URL returns a url to builder step failure page. 978 // URL returns a url to builder step failure page.
1003 func (f stepFailure) URL() string { 979 func (f stepFailure) URL() string {
1004 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build. Number) 980 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build. Number)
1005 } 981 }
OLDNEW
« no previous file with comments | « no previous file | go/src/infra/monitoring/analyzer/analyzer_test.go » ('j') | go/src/infra/monitoring/analyzer/gatekeeper.go » ('J')

Powered by Google App Engine
This is Rietveld 408576698