| OLD | NEW |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package analyzer | 5 package analyzer |
| 6 | 6 |
| 7 import ( | 7 import ( |
| 8 "errors" | 8 "errors" |
| 9 "expvar" | 9 "expvar" |
| 10 "fmt" | 10 "fmt" |
| 11 "regexp" |
| 11 "sort" | 12 "sort" |
| 12 "strings" | 13 "strings" |
| 14 "sync" |
| 13 "time" | 15 "time" |
| 14 | 16 |
| 15 "github.com/luci/luci-go/common/logging/gologger" | 17 "github.com/luci/luci-go/common/logging/gologger" |
| 16 | 18 |
| 17 "infra/monitoring/client" | 19 "infra/monitoring/client" |
| 18 "infra/monitoring/messages" | 20 "infra/monitoring/messages" |
| 19 ) | 21 ) |
| 20 | 22 |
| 21 const ( | 23 const ( |
| 22 // StepCompletedRun is a synthetic step name used to indicate the build
run is complete. | 24 // StepCompletedRun is a synthetic step name used to indicate the build
run is complete. |
| 23 » StepCompletedRun = "completed run" | 25 » StepCompletedRun = "completed run" |
| 24 » treeCloserPri = 0 | 26 |
| 25 » reliableFailureSev = 0 | 27 » // Order of severity, worst to least bad. |
| 26 » newFailureSev = 1 | 28 » treeCloserSev = iota |
| 27 » staleMasterSev = 0 | 29 » staleMasterSev |
| 28 » staleBuilderSev = 0 | 30 » infraFailureSev |
| 29 » hungBuilderSev = 1 | 31 » reliableFailureSev |
| 30 » idleBuilderSev = 1 | 32 » newFailureSev |
| 31 » offlineBuilderSev = 1 | 33 » staleBuilderSev |
| 32 » resOK = float64(1) | 34 » hungBuilderSev |
| 33 » resInfraFailure = float64(4) | 35 » idleBuilderSev |
| 36 » offlineBuilderSev |
| 37 |
| 38 » // Step result values. |
| 39 » resOK = float64(1) |
| 40 » resInfraFailure = float64(4) |
| 34 ) | 41 ) |
| 35 | 42 |
| 36 var ( | 43 var ( |
| 37 log = gologger.Get() | 44 log = gologger.Get() |
| 38 expvars = expvar.NewMap("analyzer") | 45 expvars = expvar.NewMap("analyzer") |
| 46 cpRE = regexp.MustCompile("Cr-Commit-Position: (.*)@{#([0-9]+)}") |
| 39 ) | 47 ) |
| 40 | 48 |
| 41 var ( | 49 var ( |
| 42 errNoBuildSteps = errors.New("No build steps") | 50 errNoBuildSteps = errors.New("No build steps") |
| 43 errNoRecentBuilds = errors.New("No recent builds") | 51 errNoRecentBuilds = errors.New("No recent builds") |
| 44 errNoCompletedBuilds = errors.New("No completed builds") | 52 errNoCompletedBuilds = errors.New("No completed builds") |
| 45 ) | 53 ) |
| 46 | 54 |
| 47 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for th
e | 55 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for th
e |
| 48 // failure. It also indicates whether or not it recognizes the stepFailure. | 56 // failure. It also indicates whether or not it recognizes the stepFailure. |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 87 OfflineBuilderThresh time.Duration | 95 OfflineBuilderThresh time.Duration |
| 88 | 96 |
| 89 // IdleBuilderCountThresh is the maximum number of builds a builder may
have in queue | 97 // IdleBuilderCountThresh is the maximum number of builds a builder may
have in queue |
| 90 // while in the "idle" state before triggering an "idle builder" alert. | 98 // while in the "idle" state before triggering an "idle builder" alert. |
| 91 IdleBuilderCountThresh int64 | 99 IdleBuilderCountThresh int64 |
| 92 | 100 |
| 93 // StaleMasterThreshold is the maximum age that master data from CBE can
be before | 101 // StaleMasterThreshold is the maximum age that master data from CBE can
be before |
| 94 // triggering a "stale master" alert. | 102 // triggering a "stale master" alert. |
| 95 StaleMasterThreshold time.Duration | 103 StaleMasterThreshold time.Duration |
| 96 | 104 |
| 97 » // MasterCfgs is a map of master name to MasterConfig | 105 » // Gatekeeper is the parsed gatekeeper.json config file. |
| 98 » MasterCfgs map[string]messages.MasterConfig | 106 » Gatekeeper *GatekeeperRules |
| 99 | 107 |
| 100 // These limit the scope of analysis, useful for debugging. | 108 // These limit the scope of analysis, useful for debugging. |
| 101 MasterOnly string | 109 MasterOnly string |
| 102 BuilderOnly string | 110 BuilderOnly string |
| 103 BuildOnly int64 | 111 BuildOnly int64 |
| 104 | 112 |
| 113 // rslck protects revisionSummaries from concurrent access. |
| 114 rslck *sync.Mutex |
| 105 revisionSummaries map[string]messages.RevisionSummary | 115 revisionSummaries map[string]messages.RevisionSummary |
| 106 | 116 |
| 107 // Now is useful for mocking the system clock in testing and simulating
time | 117 // Now is useful for mocking the system clock in testing and simulating
time |
| 108 // during replay. | 118 // during replay. |
| 109 Now func() time.Time | 119 Now func() time.Time |
| 110 } | 120 } |
| 111 | 121 |
| 112 // New returns a new Analyzer. If client is nil, it assigns a default implementa
tion. | 122 // New returns a new Analyzer. If client is nil, it assigns a default implementa
tion. |
| 113 // maxBuilds is the maximum number of builds to check, per builder. | 123 // maxBuilds is the maximum number of builds to check, per builder. |
| 114 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { | 124 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { |
| 115 if c == nil { | 125 if c == nil { |
| 116 c = client.NewReader(nil) | 126 c = client.NewReader(nil) |
| 117 } | 127 } |
| 118 | 128 |
| 119 return &Analyzer{ | 129 return &Analyzer{ |
| 120 Reader: c, | 130 Reader: c, |
| 121 MaxRecentBuilds: maxBuilds, | 131 MaxRecentBuilds: maxBuilds, |
| 122 MinRecentBuilds: minBuilds, | 132 MinRecentBuilds: minBuilds, |
| 123 HungBuilderThresh: 3 * time.Hour, | 133 HungBuilderThresh: 3 * time.Hour, |
| 124 OfflineBuilderThresh: 90 * time.Minute, | 134 OfflineBuilderThresh: 90 * time.Minute, |
| 125 IdleBuilderCountThresh: 50, | 135 IdleBuilderCountThresh: 50, |
| 126 StaleMasterThreshold: 10 * time.Minute, | 136 StaleMasterThreshold: 10 * time.Minute, |
| 127 StepAnalyzers: []StepAnalyzer{ | 137 StepAnalyzers: []StepAnalyzer{ |
| 128 &TestFailureAnalyzer{Reader: c}, | 138 &TestFailureAnalyzer{Reader: c}, |
| 129 &CompileFailureAnalyzer{Reader: c}, | 139 &CompileFailureAnalyzer{Reader: c}, |
| 130 }, | 140 }, |
| 131 » » MasterCfgs: map[string]messages.MasterConfig{}, | 141 » » Gatekeeper: &GatekeeperRules{}, |
| 132 | 142 » » rslck: &sync.Mutex{}, |
| 133 revisionSummaries: map[string]messages.RevisionSummary{}, | 143 revisionSummaries: map[string]messages.RevisionSummary{}, |
| 134 Now: func() time.Time { | 144 Now: func() time.Time { |
| 135 return time.Now() | 145 return time.Now() |
| 136 }, | 146 }, |
| 137 } | 147 } |
| 138 } | 148 } |
| 139 | 149 |
| 140 // MasterAlerts returns alerts generated from the master. | 150 // MasterAlerts returns alerts generated from the master. |
| 141 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess
ages.Alert { | 151 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess
ages.Alert { |
| 142 ret := []messages.Alert{} | 152 ret := []messages.Alert{} |
| 143 | 153 |
| 144 // Copied logic from builder_messages. | 154 // Copied logic from builder_messages. |
| 145 // No created_timestamp should be a warning sign, no? | 155 // No created_timestamp should be a warning sign, no? |
| 146 if be.CreatedTimestamp == messages.EpochTime(0) { | 156 if be.CreatedTimestamp == messages.EpochTime(0) { |
| 147 return ret | 157 return ret |
| 148 } | 158 } |
| 149 expvars.Add("MasterAlerts", 1) | 159 expvars.Add("MasterAlerts", 1) |
| 150 defer expvars.Add("MasterAlerts", -1) | 160 defer expvars.Add("MasterAlerts", -1) |
| 151 elapsed := a.Now().Sub(be.CreatedTimestamp.Time()) | 161 elapsed := a.Now().Sub(be.CreatedTimestamp.Time()) |
| 152 if elapsed > a.StaleMasterThreshold { | 162 if elapsed > a.StaleMasterThreshold { |
| 153 ret = append(ret, messages.Alert{ | 163 ret = append(ret, messages.Alert{ |
| 154 Key: fmt.Sprintf("stale master: %v", master), | 164 Key: fmt.Sprintf("stale master: %v", master), |
| 155 Title: fmt.Sprintf("Stale %s master data", master), | 165 Title: fmt.Sprintf("Stale %s master data", master), |
| 156 » » » Body: fmt.Sprintf("%s elapsed since last update.",
elapsed), | 166 » » » Body: fmt.Sprintf("%dh %2dm elapsed since last upda
te.", int(elapsed.Hours()), int(elapsed.Minutes())), |
| 157 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp.
Time()), | 167 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp.
Time()), |
| 158 Severity: staleMasterSev, | 168 Severity: staleMasterSev, |
| 159 Time: messages.TimeToEpochTime(a.Now()), | 169 Time: messages.TimeToEpochTime(a.Now()), |
| 160 Links: []messages.Link{{"Master", client.MasterURL(m
aster)}}, | 170 Links: []messages.Link{{"Master", client.MasterURL(m
aster)}}, |
| 161 » » » // No type or extension for now. | 171 » » » Type: "stale-master", |
| 172 » » » // No extension for now. |
| 162 }) | 173 }) |
| 163 } | 174 } |
| 164 if elapsed < 0 { | 175 if elapsed < 0 { |
| 165 // Add this to the alerts returned, rather than just log it? | 176 // Add this to the alerts returned, rather than just log it? |
| 166 log.Errorf("Master %s timestamp is newer than current time (%s):
%s old.", master, a.Now(), elapsed) | 177 log.Errorf("Master %s timestamp is newer than current time (%s):
%s old.", master, a.Now(), elapsed) |
| 167 } | 178 } |
| 168 | 179 |
| 169 return ret | 180 return ret |
| 170 } | 181 } |
| 171 | 182 |
| (...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 330 switch b.State { | 341 switch b.State { |
| 331 case messages.StateBuilding: | 342 case messages.StateBuilding: |
| 332 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun
{ | 343 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun
{ |
| 333 alerts = append(alerts, messages.Alert{ | 344 alerts = append(alerts, messages.Alert{ |
| 334 Key: fmt.Sprintf("%s.%s.hung", masterName,
builderName), | 345 Key: fmt.Sprintf("%s.%s.hung", masterName,
builderName), |
| 335 Title: fmt.Sprintf("%s.%s is hung in step %s.
", masterName, builderName, lastStep), | 346 Title: fmt.Sprintf("%s.%s is hung in step %s.
", masterName, builderName, lastStep), |
| 336 Body: fmt.Sprintf("%s.%s has been building f
or %v (last step update %s), past the alerting threshold of %v", masterName, bui
lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), | 347 Body: fmt.Sprintf("%s.%s has been building f
or %v (last step update %s), past the alerting threshold of %v", masterName, bui
lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), |
| 337 Severity: hungBuilderSev, | 348 Severity: hungBuilderSev, |
| 338 Time: messages.TimeToEpochTime(a.Now()), | 349 Time: messages.TimeToEpochTime(a.Now()), |
| 339 Links: links, | 350 Links: links, |
| 351 Type: "hung-builder", |
| 340 }) | 352 }) |
| 341 // Note, just because it's building doesn't mean it's in
a good state. If the last N builds | 353 // Note, just because it's building doesn't mean it's in
a good state. If the last N builds |
| 342 // all failed (for some large N) then this might still b
e alertable. | 354 // all failed (for some large N) then this might still b
e alertable. |
| 343 } | 355 } |
| 344 case messages.StateOffline: | 356 case messages.StateOffline: |
| 345 if elapsed > a.OfflineBuilderThresh { | 357 if elapsed > a.OfflineBuilderThresh { |
| 346 alerts = append(alerts, messages.Alert{ | 358 alerts = append(alerts, messages.Alert{ |
| 347 Key: fmt.Sprintf("%s.%s.offline", masterNam
e, builderName), | 359 Key: fmt.Sprintf("%s.%s.offline", masterNam
e, builderName), |
| 348 Title: fmt.Sprintf("%s.%s is offline.", maste
rName, builderName), | 360 Title: fmt.Sprintf("%s.%s is offline.", maste
rName, builderName), |
| 349 Body: fmt.Sprintf("%s.%s has been offline fo
r %v (last step update %s %v), past the alerting threshold of %v", masterName, b
uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT
hresh), | 361 Body: fmt.Sprintf("%s.%s has been offline fo
r %v (last step update %s %v), past the alerting threshold of %v", masterName, b
uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT
hresh), |
| 350 Severity: offlineBuilderSev, | 362 Severity: offlineBuilderSev, |
| 351 Time: messages.TimeToEpochTime(a.Now()), | 363 Time: messages.TimeToEpochTime(a.Now()), |
| 352 Links: links, | 364 Links: links, |
| 365 Type: "offline-builder", |
| 353 }) | 366 }) |
| 354 } | 367 } |
| 355 case messages.StateIdle: | 368 case messages.StateIdle: |
| 356 if b.PendingBuilds > a.IdleBuilderCountThresh { | 369 if b.PendingBuilds > a.IdleBuilderCountThresh { |
| 357 alerts = append(alerts, messages.Alert{ | 370 alerts = append(alerts, messages.Alert{ |
| 358 Key: fmt.Sprintf("%s.%s.idle", masterName,
builderName), | 371 Key: fmt.Sprintf("%s.%s.idle", masterName,
builderName), |
| 359 Title: fmt.Sprintf("%s.%s is idle with too ma
ny pending builds.", masterName, builderName), | 372 Title: fmt.Sprintf("%s.%s is idle with too ma
ny pending builds.", masterName, builderName), |
| 360 Body: fmt.Sprintf("%s.%s is idle with %d pen
ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend
ingBuilds, a.IdleBuilderCountThresh), | 373 Body: fmt.Sprintf("%s.%s is idle with %d pen
ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend
ingBuilds, a.IdleBuilderCountThresh), |
| 361 Severity: idleBuilderSev, | 374 Severity: idleBuilderSev, |
| 362 Time: messages.TimeToEpochTime(a.Now()), | 375 Time: messages.TimeToEpochTime(a.Now()), |
| 363 Links: links, | 376 Links: links, |
| 377 Type: "idle-builder", |
| 364 }) | 378 }) |
| 365 } | 379 } |
| 366 default: | 380 default: |
| 367 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde
rName, b.State) | 381 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde
rName, b.State) |
| 368 } | 382 } |
| 369 | 383 |
| 370 // Check for alerts on the most recent complete build | 384 // Check for alerts on the most recent complete build |
| 371 log.Infof("Checking %d most recent builds for alertable step failures: %
s/%s", len(recentBuildIDs), masterName, builderName) | 385 log.Infof("Checking %d most recent builds for alertable step failures: %
s/%s", len(recentBuildIDs), masterName, builderName) |
| 372 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl
etedBuild.Number}) | 386 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl
etedBuild.Number}) |
| 373 | 387 |
| (...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 487 mergedBF.RegressionRanges = append(mergedBF.RegressionRa
nges, messages.RegressionRange{ | 501 mergedBF.RegressionRanges = append(mergedBF.RegressionRa
nges, messages.RegressionRange{ |
| 488 Repo: repo, | 502 Repo: repo, |
| 489 Positions: uniques(pos), | 503 Positions: uniques(pos), |
| 490 Revisions: uniques(revs), | 504 Revisions: uniques(revs), |
| 491 }) | 505 }) |
| 492 } | 506 } |
| 493 | 507 |
| 494 sort.Sort(byRepo(mergedBF.RegressionRanges)) | 508 sort.Sort(byRepo(mergedBF.RegressionRanges)) |
| 495 | 509 |
| 496 if len(mergedBF.Builders) > 1 { | 510 if len(mergedBF.Builders) > 1 { |
| 497 » » » merged.Title = fmt.Sprintf("%s (failing on %d builders)"
, step, len(mergedBF.Builders)) | 511 » » » merged.Title = fmt.Sprintf("%s failing on %d builders",
step, len(mergedBF.Builders)) |
| 512 » » » builderNames := []string{} |
| 513 » » » for _, b := range mergedBF.Builders { |
| 514 » » » » builderNames = append(builderNames, b.Name) |
| 515 » » » } |
| 516 » » » merged.Body = strings.Join(builderNames, ", ") |
| 498 } | 517 } |
| 499 merged.Extension = mergedBF | 518 merged.Extension = mergedBF |
| 500 mergedAlerts = append(mergedAlerts, merged) | 519 mergedAlerts = append(mergedAlerts, merged) |
| 501 } | 520 } |
| 502 | 521 |
| 503 return mergedAlerts | 522 return mergedAlerts |
| 504 } | 523 } |
| 505 | 524 |
| 506 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of has
hes. | 525 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of has
hes. |
| 507 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum
mary, error) { | 526 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum
mary, error) { |
| 508 ret := []messages.RevisionSummary{} | 527 ret := []messages.RevisionSummary{} |
| 509 for _, h := range hashes { | 528 for _, h := range hashes { |
| 529 a.rslck.Lock() |
| 510 s, ok := a.revisionSummaries[h] | 530 s, ok := a.revisionSummaries[h] |
| 531 a.rslck.Unlock() |
| 511 if !ok { | 532 if !ok { |
| 512 return nil, fmt.Errorf("Unrecognized hash: %s", h) | 533 return nil, fmt.Errorf("Unrecognized hash: %s", h) |
| 513 } | 534 } |
| 514 ret = append(ret, s) | 535 ret = append(ret, s) |
| 515 } | 536 } |
| 516 | 537 |
| 517 return ret, nil | 538 return ret, nil |
| 518 } | 539 } |
| 519 | 540 |
| 520 // builderStepAlerts scans the steps of recent builds done on a particular build
er, | 541 // builderStepAlerts scans the steps of recent builds done on a particular build
er, |
| (...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 707 // goroutine/channel because the reasonsForFailure call potentia
lly | 728 // goroutine/channel because the reasonsForFailure call potentia
lly |
| 708 // blocks on IO. | 729 // blocks on IO. |
| 709 if failure.step.Name == "steps" { | 730 if failure.step.Name == "steps" { |
| 710 // check results to see if it's an array of [4] | 731 // check results to see if it's an array of [4] |
| 711 // That's a purple failure, which should go to infra/tro
oper. | 732 // That's a purple failure, which should go to infra/tro
oper. |
| 712 log.Infof("steps results: %+v", failure.step) | 733 log.Infof("steps results: %+v", failure.step) |
| 713 if len(failure.step.Results) > 0 { | 734 if len(failure.step.Results) > 0 { |
| 714 if r, ok := failure.step.Results[0].(float64); o
k && r == resInfraFailure { | 735 if r, ok := failure.step.Results[0].(float64); o
k && r == resInfraFailure { |
| 715 // TODO: Create a trooper alert about th
is. | 736 // TODO: Create a trooper alert about th
is. |
| 716 log.Errorf("INFRA FAILURE: %+v", failure
) | 737 log.Errorf("INFRA FAILURE: %+v", failure
) |
| 738 alr := messages.Alert{ |
| 739 Title: fmt.Sprintf("%s infra
failure", failure.builderName), |
| 740 Body: fmt.Sprintf("On step %
s", failure.step.Name), |
| 741 Type: "infra-failure", |
| 742 Severity: infraFailureSev, |
| 743 } |
| 744 rs <- res{ |
| 745 f: failure, |
| 746 a: &alr, |
| 747 err: nil, |
| 748 } |
| 717 } | 749 } |
| 718 } | 750 } |
| 719 continue | 751 continue |
| 720 // The actual breaking step will appear later. | 752 // The actual breaking step will appear later. |
| 721 } | 753 } |
| 722 | 754 |
| 723 // Check the gatekeeper configs to see if this is ignorable. | 755 // Check the gatekeeper configs to see if this is ignorable. |
| 724 » » if a.excludeFailure(failure.masterName, failure.builderName, fai
lure.step.Name) { | 756 » » if a.Gatekeeper.ExcludeFailure(failure.masterName, failure.build
erName, failure.step.Name) { |
| 725 continue | 757 continue |
| 726 } | 758 } |
| 727 | 759 |
| 728 // Gets the named revision number from gnumbd metadata. | 760 // Gets the named revision number from gnumbd metadata. |
| 729 getCommitPos := func(b messages.Build, name string) (string, boo
l) { | 761 getCommitPos := func(b messages.Build, name string) (string, boo
l) { |
| 730 for _, p := range b.Properties { | 762 for _, p := range b.Properties { |
| 731 if p[0] == name { | 763 if p[0] == name { |
| 732 s, ok := p[1].(string) | 764 s, ok := p[1].(string) |
| 733 return s, ok | 765 return s, ok |
| 734 } | 766 } |
| 735 } | 767 } |
| 736 return "", false | 768 return "", false |
| 737 } | 769 } |
| 738 | 770 |
| 739 scannedFailures = append(scannedFailures, failure) | 771 scannedFailures = append(scannedFailures, failure) |
| 740 go func(f stepFailure) { | 772 go func(f stepFailure) { |
| 741 expvars.Add("StepFailures", 1) | 773 expvars.Add("StepFailures", 1) |
| 742 defer expvars.Add("StepFailures", -1) | 774 defer expvars.Add("StepFailures", -1) |
| 743 alr := messages.Alert{ | 775 alr := messages.Alert{ |
| 744 » » » » Title: fmt.Sprintf("Builder step failure: %s.%s"
, f.masterName, f.builderName), | 776 » » » » Title: fmt.Sprintf("%s step failure", f.build
erName), |
| 745 » » » » Time: messages.EpochTime(a.Now().Unix()), | 777 » » » » Body: fmt.Sprintf("%s failing on %s/%s", f.s
tep.Name, f.masterName, f.builderName), |
| 746 » » » » Type: "buildfailure", | 778 » » » » Time: messages.EpochTime(a.Now().Unix()), |
| 779 » » » » Type: "buildfailure", |
| 780 » » » » Severity: newFailureSev, |
| 747 } | 781 } |
| 748 | 782 |
| 749 regRanges := []messages.RegressionRange{} | 783 regRanges := []messages.RegressionRange{} |
| 750 revisionsByRepo := map[string][]string{} | 784 revisionsByRepo := map[string][]string{} |
| 751 | 785 |
| 752 // Get gnumbd sequence numbers for whatever this build p
ulled in. | 786 // Get gnumbd sequence numbers for whatever this build p
ulled in. |
| 753 chromiumPos, ok := getCommitPos(f.build, "got_revision_c
p") | 787 chromiumPos, ok := getCommitPos(f.build, "got_revision_c
p") |
| 754 if ok { | 788 if ok { |
| 755 regRanges = append(regRanges, messages.Regressio
nRange{ | 789 regRanges = append(regRanges, messages.Regressio
nRange{ |
| 756 Repo: "chromium", | 790 Repo: "chromium", |
| (...skipping 23 matching lines...) Expand all Loading... |
| 780 Repo: "nacl", | 814 Repo: "nacl", |
| 781 Positions: []string{naclPos}, | 815 Positions: []string{naclPos}, |
| 782 }) | 816 }) |
| 783 } | 817 } |
| 784 | 818 |
| 785 for _, change := range f.build.SourceStamp.Changes { | 819 for _, change := range f.build.SourceStamp.Changes { |
| 786 revisionsByRepo[change.Repository] = append(revi
sionsByRepo[change.Repository], change.Revision) | 820 revisionsByRepo[change.Repository] = append(revi
sionsByRepo[change.Repository], change.Revision) |
| 787 // change.Revision is *not* always a git hash. S
ometimes it is a position from gnumbd. | 821 // change.Revision is *not* always a git hash. S
ometimes it is a position from gnumbd. |
| 788 // change.Revision is git hash or gnumbd dependi
ng on what exactly? Not obvious at this time. | 822 // change.Revision is git hash or gnumbd dependi
ng on what exactly? Not obvious at this time. |
| 789 // A potential problem here is when multiple rep
os have overlapping gnumbd ranges. | 823 // A potential problem here is when multiple rep
os have overlapping gnumbd ranges. |
| 824 parts := cpRE.FindAllStringSubmatch(change.Comme
nts, -1) |
| 825 pos, branch := "", "" |
| 826 if len(parts) > 0 { |
| 827 branch = parts[0][1] |
| 828 pos = parts[0][2] |
| 829 } |
| 830 a.rslck.Lock() |
| 790 a.revisionSummaries[change.Revision] = messages.
RevisionSummary{ | 831 a.revisionSummaries[change.Revision] = messages.
RevisionSummary{ |
| 791 GitHash: change.Revision, | 832 GitHash: change.Revision, |
| 792 Link: change.Revlink, | 833 Link: change.Revlink, |
| 793 Description: trunc(change.Comments), | 834 Description: trunc(change.Comments), |
| 794 Author: change.Who, | 835 Author: change.Who, |
| 795 When: change.When, | 836 When: change.When, |
| 837 Position: pos, |
| 838 Branch: branch, |
| 796 } | 839 } |
| 840 a.rslck.Unlock() |
| 797 } | 841 } |
| 798 | 842 |
| 799 for repo, revisions := range revisionsByRepo { | 843 for repo, revisions := range revisionsByRepo { |
| 800 regRanges = append(regRanges, messages.Regressio
nRange{ | 844 regRanges = append(regRanges, messages.Regressio
nRange{ |
| 801 Repo: repo, | 845 Repo: repo, |
| 802 Revisions: revisions, | 846 Revisions: revisions, |
| 803 }) | 847 }) |
| 804 } | 848 } |
| 805 | 849 |
| 806 // If the builder has been failing on the same step for
multiple builds in a row, | 850 // If the builder has been failing on the same step for
multiple builds in a row, |
| 807 // we should have only one alert but indicate the range
of builds affected. | 851 // we should have only one alert but indicate the range
of builds affected. |
| 808 // These are set in FirstFailure and LastFailure. | 852 // These are set in FirstFailure and LastFailure. |
| 809 bf := messages.BuildFailure{ | 853 bf := messages.BuildFailure{ |
| 810 // FIXME: group builders? | 854 // FIXME: group builders? |
| 811 Builders: []messages.AlertedBuilder{ | 855 Builders: []messages.AlertedBuilder{ |
| 812 { | 856 { |
| 813 Name: f.builderName, | 857 Name: f.builderName, |
| 814 URL: client.BuilderURL
(f.masterName, f.builderName), | 858 URL: client.BuilderURL
(f.masterName, f.builderName), |
| 815 StartTime: f.build.CreatedTi
mestamp, | 859 StartTime: f.build.CreatedTi
mestamp, |
| 816 FirstFailure: f.build.Number, | 860 FirstFailure: f.build.Number, |
| 817 LatestFailure: f.build.Number, | 861 LatestFailure: f.build.Number, |
| 818 }, | 862 }, |
| 819 }, | 863 }, |
| 820 » » » » TreeCloser: a.wouldCloseTree(f.masterName,
f.builderName, f.step.Name), | 864 » » » » TreeCloser: a.Gatekeeper.WouldCloseTree(f.
masterName, f.builderName, f.step.Name), |
| 821 RegressionRanges: regRanges, | 865 RegressionRanges: regRanges, |
| 822 } | 866 } |
| 823 | 867 |
| 868 if bf.TreeCloser { |
| 869 alr.Severity = treeCloserSev |
| 870 } |
| 871 |
| 824 reasons := a.reasonsForFailure(f) | 872 reasons := a.reasonsForFailure(f) |
| 825 for _, r := range reasons { | 873 for _, r := range reasons { |
| 826 bf.Reasons = append(bf.Reasons, messages.Reason{ | 874 bf.Reasons = append(bf.Reasons, messages.Reason{ |
| 827 TestName: r, | 875 TestName: r, |
| 828 Step: f.step.Name, | 876 Step: f.step.Name, |
| 829 URL: f.URL(), | 877 URL: f.URL(), |
| 830 }) | 878 }) |
| 831 } | 879 } |
| 832 | 880 |
| 833 alr.Key = alertKey(f.masterName, f.builderName, f.step.N
ame) | 881 alr.Key = alertKey(f.masterName, f.builderName, f.step.N
ame) |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 888 | 936 |
| 889 if !recognized { | 937 if !recognized { |
| 890 // TODO: log and report frequently encountered unrecognized buil
der step | 938 // TODO: log and report frequently encountered unrecognized buil
der step |
| 891 // failure names. | 939 // failure names. |
| 892 log.Errorf("Unrecognized step step failure type, unable to find
reasons: %s", f.step.Name) | 940 log.Errorf("Unrecognized step step failure type, unable to find
reasons: %s", f.step.Name) |
| 893 } | 941 } |
| 894 | 942 |
| 895 return ret | 943 return ret |
| 896 } | 944 } |
| 897 | 945 |
| 898 func (a *Analyzer) excludeFailure(master, builder, step string) bool { | |
| 899 mc, ok := a.MasterCfgs[master] | |
| 900 if !ok { | |
| 901 log.Errorf("Can't filter unknown master %s", master) | |
| 902 return false | |
| 903 } | |
| 904 | |
| 905 for _, ebName := range mc.ExcludedBuilders { | |
| 906 if ebName == "*" || ebName == builder { | |
| 907 return true | |
| 908 } | |
| 909 } | |
| 910 | |
| 911 // Not clear that builder_alerts even looks at the rest of these conditi
ons | |
| 912 // even though they're specified in gatekeeper.json | |
| 913 for _, s := range mc.ExcludedSteps { | |
| 914 if step == s { | |
| 915 return true | |
| 916 } | |
| 917 } | |
| 918 | |
| 919 bc, ok := mc.Builders[builder] | |
| 920 if !ok { | |
| 921 if bc, ok = mc.Builders["*"]; !ok { | |
| 922 log.Warningf("Unknown %s builder %s", master, builder) | |
| 923 return true | |
| 924 } | |
| 925 } | |
| 926 | |
| 927 for _, esName := range bc.ExcludedSteps { | |
| 928 if esName == step || esName == "*" { | |
| 929 return true | |
| 930 } | |
| 931 } | |
| 932 | |
| 933 return false | |
| 934 } | |
| 935 | |
| 936 func (a *Analyzer) wouldCloseTree(master, builder, step string) bool { | |
| 937 mc, ok := a.MasterCfgs[master] | |
| 938 if !ok { | |
| 939 log.Errorf("Missing master cfg: %s", master) | |
| 940 return false | |
| 941 } | |
| 942 bc, ok := mc.Builders[builder] | |
| 943 if !ok { | |
| 944 bc, ok = mc.Builders["*"] | |
| 945 if ok { | |
| 946 return true | |
| 947 } | |
| 948 } | |
| 949 | |
| 950 for _, xstep := range bc.ExcludedSteps { | |
| 951 if xstep == step { | |
| 952 return false | |
| 953 } | |
| 954 } | |
| 955 | |
| 956 csteps := []string{} | |
| 957 csteps = append(csteps, bc.ClosingSteps...) | |
| 958 csteps = append(csteps, bc.ClosingOptional...) | |
| 959 | |
| 960 for _, cs := range csteps { | |
| 961 if cs == "*" || cs == step { | |
| 962 return true | |
| 963 } | |
| 964 } | |
| 965 | |
| 966 return false | |
| 967 } | |
| 968 | |
| 969 // unexpected returns the set of expected xor actual. | 946 // unexpected returns the set of expected xor actual. |
| 970 func unexpected(expected, actual []string) []string { | 947 func unexpected(expected, actual []string) []string { |
| 971 e, a := make(map[string]bool), make(map[string]bool) | 948 e, a := make(map[string]bool), make(map[string]bool) |
| 972 for _, s := range expected { | 949 for _, s := range expected { |
| 973 e[s] = true | 950 e[s] = true |
| 974 } | 951 } |
| 975 for _, s := range actual { | 952 for _, s := range actual { |
| 976 a[s] = true | 953 a[s] = true |
| 977 } | 954 } |
| 978 | 955 |
| (...skipping 17 matching lines...) Expand all Loading... |
| 996 masterName string | 973 masterName string |
| 997 builderName string | 974 builderName string |
| 998 build messages.Build | 975 build messages.Build |
| 999 step messages.Step | 976 step messages.Step |
| 1000 } | 977 } |
| 1001 | 978 |
| 1002 // URL returns a url to builder step failure page. | 979 // URL returns a url to builder step failure page. |
| 1003 func (f stepFailure) URL() string { | 980 func (f stepFailure) URL() string { |
| 1004 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build.
Number) | 981 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build.
Number) |
| 1005 } | 982 } |
| OLD | NEW |