Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package analyzer | 5 package analyzer |
| 6 | 6 |
| 7 import ( | 7 import ( |
| 8 "errors" | 8 "errors" |
| 9 "expvar" | 9 "expvar" |
| 10 "fmt" | 10 "fmt" |
| 11 "regexp" | |
| 11 "sort" | 12 "sort" |
| 12 "strings" | 13 "strings" |
| 14 "sync" | |
| 13 "time" | 15 "time" |
| 14 | 16 |
| 15 "github.com/luci/luci-go/common/logging/gologger" | 17 "github.com/luci/luci-go/common/logging/gologger" |
| 16 | 18 |
| 17 "infra/monitoring/client" | 19 "infra/monitoring/client" |
| 18 "infra/monitoring/messages" | 20 "infra/monitoring/messages" |
| 19 ) | 21 ) |
| 20 | 22 |
| 21 const ( | 23 const ( |
| 22 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. | 24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete. |
| 23 » StepCompletedRun = "completed run" | 25 » StepCompletedRun = "completed run" |
| 24 » treeCloserPri = 0 | 26 |
| 25 » reliableFailureSev = 0 | 27 » // Order of severity, worst to least bad. |
| 26 » newFailureSev = 1 | 28 » treeCloserSev = iota |
| 27 » staleMasterSev = 0 | 29 » staleMasterSev |
| 28 » staleBuilderSev = 0 | 30 » infraFailureSev |
| 29 » hungBuilderSev = 1 | 31 » reliableFailureSev |
| 30 » idleBuilderSev = 1 | 32 » newFailureSev |
| 31 » offlineBuilderSev = 1 | 33 » staleBuilderSev |
| 32 » resOK = float64(1) | 34 » hungBuilderSev |
| 33 » resInfraFailure = float64(4) | 35 » idleBuilderSev |
| 36 » offlineBuilderSev | |
| 37 | |
| 38 » // Step result values. | |
| 39 » resOK = float64(1) | |
| 40 » resInfraFailure = float64(4) | |
| 34 ) | 41 ) |
| 35 | 42 |
| 36 var ( | 43 var ( |
| 37 log = gologger.Get() | 44 log = gologger.Get() |
| 38 expvars = expvar.NewMap("analyzer") | 45 expvars = expvar.NewMap("analyzer") |
| 46 cpRE = regexp.MustCompile("Cr-Commit-Position: (.*)@{#([0-9]+)}") | |
| 39 ) | 47 ) |
| 40 | 48 |
| 41 var ( | 49 var ( |
| 42 errNoBuildSteps = errors.New("No build steps") | 50 errNoBuildSteps = errors.New("No build steps") |
| 43 errNoRecentBuilds = errors.New("No recent builds") | 51 errNoRecentBuilds = errors.New("No recent builds") |
| 44 errNoCompletedBuilds = errors.New("No completed builds") | 52 errNoCompletedBuilds = errors.New("No completed builds") |
| 45 ) | 53 ) |
| 46 | 54 |
| 47 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for the | 55 // StepAnalyzer reasons about a stepFailure and produces a set of reasons for the |
| 48 // failure. It also indicates whether or not it recognizes the stepFailure. | 56 // failure. It also indicates whether or not it recognizes the stepFailure. |
| (...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 87 OfflineBuilderThresh time.Duration | 95 OfflineBuilderThresh time.Duration |
| 88 | 96 |
| 89 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue | 97 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue |
| 90 // while in the "idle" state before triggering an "idle builder" alert. | 98 // while in the "idle" state before triggering an "idle builder" alert. |
| 91 IdleBuilderCountThresh int64 | 99 IdleBuilderCountThresh int64 |
| 92 | 100 |
| 93 // StaleMasterThreshold is the maximum age that master data from CBE can be before | 101 // StaleMasterThreshold is the maximum age that master data from CBE can be before |
| 94 // triggering a "stale master" alert. | 102 // triggering a "stale master" alert. |
| 95 StaleMasterThreshold time.Duration | 103 StaleMasterThreshold time.Duration |
| 96 | 104 |
| 97 » // MasterCfgs is a map of master name to MasterConfig | 105 » // Gatekeeper is the parsed gatekeeper.json config file. |
| 98 » MasterCfgs map[string]messages.MasterConfig | 106 » Gatekeeper *GatekeeperRules |
| 99 | 107 |
| 100 // These limit the scope analysis, useful for debugging. | 108 // These limit the scope analysis, useful for debugging. |
| 101 MasterOnly string | 109 MasterOnly string |
| 102 BuilderOnly string | 110 BuilderOnly string |
| 103 BuildOnly int64 | 111 BuildOnly int64 |
| 104 | 112 |
| 113 rslck *sync.Mutex | |
|
martiniss
2016/04/12 22:02:54
could you put a comment saying what resources this
| |
| 105 revisionSummaries map[string]messages.RevisionSummary | 114 revisionSummaries map[string]messages.RevisionSummary |
| 106 | 115 |
| 107 // Now is useful for mocking the system clock in testing and simulating time | 116 // Now is useful for mocking the system clock in testing and simulating time |
| 108 // during replay. | 117 // during replay. |
| 109 Now func() time.Time | 118 Now func() time.Time |
| 110 } | 119 } |
| 111 | 120 |
| 112 // New returns a new Analyzer. If client is nil, it assigns a default implementation. | 121 // New returns a new Analyzer. If client is nil, it assigns a default implementation. |
| 113 // maxBuilds is the maximum number of builds to check, per builder. | 122 // maxBuilds is the maximum number of builds to check, per builder. |
| 114 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { | 123 func New(c client.Reader, minBuilds, maxBuilds int) *Analyzer { |
| 115 if c == nil { | 124 if c == nil { |
| 116 c = client.NewReader(nil) | 125 c = client.NewReader(nil) |
| 117 } | 126 } |
| 118 | 127 |
| 119 return &Analyzer{ | 128 return &Analyzer{ |
| 120 Reader: c, | 129 Reader: c, |
| 121 MaxRecentBuilds: maxBuilds, | 130 MaxRecentBuilds: maxBuilds, |
| 122 MinRecentBuilds: minBuilds, | 131 MinRecentBuilds: minBuilds, |
| 123 HungBuilderThresh: 3 * time.Hour, | 132 HungBuilderThresh: 3 * time.Hour, |
| 124 OfflineBuilderThresh: 90 * time.Minute, | 133 OfflineBuilderThresh: 90 * time.Minute, |
| 125 IdleBuilderCountThresh: 50, | 134 IdleBuilderCountThresh: 50, |
| 126 StaleMasterThreshold: 10 * time.Minute, | 135 StaleMasterThreshold: 10 * time.Minute, |
| 127 StepAnalyzers: []StepAnalyzer{ | 136 StepAnalyzers: []StepAnalyzer{ |
| 128 &TestFailureAnalyzer{Reader: c}, | 137 &TestFailureAnalyzer{Reader: c}, |
| 129 &CompileFailureAnalyzer{Reader: c}, | 138 &CompileFailureAnalyzer{Reader: c}, |
| 130 }, | 139 }, |
| 131 » » MasterCfgs: map[string]messages.MasterConfig{}, | 140 » » Gatekeeper: &GatekeeperRules{}, |
| 132 | 141 » » rslck: &sync.Mutex{}, |
|
martiniss
2016/04/12 22:02:54
could you put a comment saying what resources this
seanmccullough
2016/04/12 22:13:42
Done.
| |
| 133 revisionSummaries: map[string]messages.RevisionSummary{}, | 142 revisionSummaries: map[string]messages.RevisionSummary{}, |
| 134 Now: func() time.Time { | 143 Now: func() time.Time { |
| 135 return time.Now() | 144 return time.Now() |
| 136 }, | 145 }, |
| 137 } | 146 } |
| 138 } | 147 } |
| 139 | 148 |
| 140 // MasterAlerts returns alerts generated from the master. | 149 // MasterAlerts returns alerts generated from the master. |
| 141 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess ages.Alert { | 150 func (a *Analyzer) MasterAlerts(master string, be *messages.BuildExtract) []mess ages.Alert { |
| 142 ret := []messages.Alert{} | 151 ret := []messages.Alert{} |
| 143 | 152 |
| 144 // Copied logic from builder_messages. | 153 // Copied logic from builder_messages. |
| 145 // No created_timestamp should be a warning sign, no? | 154 // No created_timestamp should be a warning sign, no? |
| 146 if be.CreatedTimestamp == messages.EpochTime(0) { | 155 if be.CreatedTimestamp == messages.EpochTime(0) { |
| 147 return ret | 156 return ret |
| 148 } | 157 } |
| 149 expvars.Add("MasterAlerts", 1) | 158 expvars.Add("MasterAlerts", 1) |
| 150 defer expvars.Add("MasterAlerts", -1) | 159 defer expvars.Add("MasterAlerts", -1) |
| 151 elapsed := a.Now().Sub(be.CreatedTimestamp.Time()) | 160 elapsed := a.Now().Sub(be.CreatedTimestamp.Time()) |
| 152 if elapsed > a.StaleMasterThreshold { | 161 if elapsed > a.StaleMasterThreshold { |
| 153 ret = append(ret, messages.Alert{ | 162 ret = append(ret, messages.Alert{ |
| 154 Key: fmt.Sprintf("stale master: %v", master), | 163 Key: fmt.Sprintf("stale master: %v", master), |
| 155 Title: fmt.Sprintf("Stale %s master data", master), | 164 Title: fmt.Sprintf("Stale %s master data", master), |
| 156 » » » Body: fmt.Sprintf("%s elapsed since last update.", elapsed), | 165 » » » Body: fmt.Sprintf("%dh %2dm elapsed since last upda te.", int(elapsed.Hours()), int(elapsed.Minutes())), |
| 157 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp. Time()), | 166 StartTime: messages.TimeToEpochTime(be.CreatedTimestamp. Time()), |
| 158 Severity: staleMasterSev, | 167 Severity: staleMasterSev, |
| 159 Time: messages.TimeToEpochTime(a.Now()), | 168 Time: messages.TimeToEpochTime(a.Now()), |
| 160 Links: []messages.Link{{"Master", client.MasterURL(m aster)}}, | 169 Links: []messages.Link{{"Master", client.MasterURL(m aster)}}, |
| 161 » » » // No type or extension for now. | 170 » » » Type: "stale-master", |
| 171 » » » // No extension for now. | |
| 162 }) | 172 }) |
| 163 } | 173 } |
| 164 if elapsed < 0 { | 174 if elapsed < 0 { |
| 165 // Add this to the alerts returned, rather than just log it? | 175 // Add this to the alerts returned, rather than just log it? |
| 166 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed) | 176 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", master, a.Now(), elapsed) |
| 167 } | 177 } |
| 168 | 178 |
| 169 return ret | 179 return ret |
| 170 } | 180 } |
| 171 | 181 |
| (...skipping 158 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 330 switch b.State { | 340 switch b.State { |
| 331 case messages.StateBuilding: | 341 case messages.StateBuilding: |
| 332 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { | 342 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun { |
| 333 alerts = append(alerts, messages.Alert{ | 343 alerts = append(alerts, messages.Alert{ |
| 334 Key: fmt.Sprintf("%s.%s.hung", masterName, builderName), | 344 Key: fmt.Sprintf("%s.%s.hung", masterName, builderName), |
| 335 Title: fmt.Sprintf("%s.%s is hung in step %s. ", masterName, builderName, lastStep), | 345 Title: fmt.Sprintf("%s.%s is hung in step %s. ", masterName, builderName, lastStep), |
| 336 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", masterName, bui lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), | 346 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", masterName, bui lderName, elapsed, lastUpdated.Time(), a.HungBuilderThresh), |
| 337 Severity: hungBuilderSev, | 347 Severity: hungBuilderSev, |
| 338 Time: messages.TimeToEpochTime(a.Now()), | 348 Time: messages.TimeToEpochTime(a.Now()), |
| 339 Links: links, | 349 Links: links, |
| 350 Type: "hung-builder", | |
| 340 }) | 351 }) |
| 341 // Note, just because it's building doesn't mean it's in a good state. If the last N builds | 352 // Note, just because it's building doesn't mean it's in a good state. If the last N builds |
| 342 // all failed (for some large N) then this might still be alertable. | 353 // all failed (for some large N) then this might still be alertable. |
| 343 } | 354 } |
| 344 case messages.StateOffline: | 355 case messages.StateOffline: |
| 345 if elapsed > a.OfflineBuilderThresh { | 356 if elapsed > a.OfflineBuilderThresh { |
| 346 alerts = append(alerts, messages.Alert{ | 357 alerts = append(alerts, messages.Alert{ |
| 347 Key: fmt.Sprintf("%s.%s.offline", masterNam e, builderName), | 358 Key: fmt.Sprintf("%s.%s.offline", masterNam e, builderName), |
| 348 Title: fmt.Sprintf("%s.%s is offline.", maste rName, builderName), | 359 Title: fmt.Sprintf("%s.%s is offline.", maste rName, builderName), |
| 349 Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", masterName, b uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT hresh), | 360 Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", masterName, b uilderName, elapsed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderT hresh), |
| 350 Severity: offlineBuilderSev, | 361 Severity: offlineBuilderSev, |
| 351 Time: messages.TimeToEpochTime(a.Now()), | 362 Time: messages.TimeToEpochTime(a.Now()), |
| 352 Links: links, | 363 Links: links, |
| 364 Type: "offline-builder", | |
| 353 }) | 365 }) |
| 354 } | 366 } |
| 355 case messages.StateIdle: | 367 case messages.StateIdle: |
| 356 if b.PendingBuilds > a.IdleBuilderCountThresh { | 368 if b.PendingBuilds > a.IdleBuilderCountThresh { |
| 357 alerts = append(alerts, messages.Alert{ | 369 alerts = append(alerts, messages.Alert{ |
| 358 Key: fmt.Sprintf("%s.%s.idle", masterName, builderName), | 370 Key: fmt.Sprintf("%s.%s.idle", masterName, builderName), |
| 359 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", masterName, builderName), | 371 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", masterName, builderName), |
| 360 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend ingBuilds, a.IdleBuilderCountThresh), | 372 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", masterName, builderName, b.Pend ingBuilds, a.IdleBuilderCountThresh), |
| 361 Severity: idleBuilderSev, | 373 Severity: idleBuilderSev, |
| 362 Time: messages.TimeToEpochTime(a.Now()), | 374 Time: messages.TimeToEpochTime(a.Now()), |
| 363 Links: links, | 375 Links: links, |
| 376 Type: "idle-builder", | |
| 364 }) | 377 }) |
| 365 } | 378 } |
| 366 default: | 379 default: |
| 367 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde rName, b.State) | 380 log.Errorf("Unknown %s.%s builder state: %s", masterName, builde rName, b.State) |
| 368 } | 381 } |
| 369 | 382 |
| 370 // Check for alerts on the most recent complete build | 383 // Check for alerts on the most recent complete build |
| 371 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), masterName, builderName) | 384 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), masterName, builderName) |
| 372 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl etedBuild.Number}) | 385 as, es := a.builderStepAlerts(masterName, builderName, []int64{lastCompl etedBuild.Number}) |
| 373 | 386 |
| (...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 487 mergedBF.RegressionRanges = append(mergedBF.RegressionRa nges, messages.RegressionRange{ | 500 mergedBF.RegressionRanges = append(mergedBF.RegressionRa nges, messages.RegressionRange{ |
| 488 Repo: repo, | 501 Repo: repo, |
| 489 Positions: uniques(pos), | 502 Positions: uniques(pos), |
| 490 Revisions: uniques(revs), | 503 Revisions: uniques(revs), |
| 491 }) | 504 }) |
| 492 } | 505 } |
| 493 | 506 |
| 494 sort.Sort(byRepo(mergedBF.RegressionRanges)) | 507 sort.Sort(byRepo(mergedBF.RegressionRanges)) |
| 495 | 508 |
| 496 if len(mergedBF.Builders) > 1 { | 509 if len(mergedBF.Builders) > 1 { |
| 497 » » » merged.Title = fmt.Sprintf("%s (failing on %d builders)" , step, len(mergedBF.Builders)) | 510 » » » merged.Title = fmt.Sprintf("%s failing on %d builders", step, len(mergedBF.Builders)) |
| 511 » » » builderNames := []string{} | |
| 512 » » » for _, b := range mergedBF.Builders { | |
| 513 » » » » builderNames = append(builderNames, b.Name) | |
| 514 » » » } | |
| 515 » » » merged.Body = strings.Join(builderNames, ", ") | |
| 498 } | 516 } |
| 499 merged.Extension = mergedBF | 517 merged.Extension = mergedBF |
| 500 mergedAlerts = append(mergedAlerts, merged) | 518 mergedAlerts = append(mergedAlerts, merged) |
| 501 } | 519 } |
| 502 | 520 |
| 503 return mergedAlerts | 521 return mergedAlerts |
| 504 } | 522 } |
| 505 | 523 |
| 506 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of hashes. | 524 // GetRevisionSummaries returns a slice of RevisionSummaries for the list of hashes. |
| 507 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum mary, error) { | 525 func (a *Analyzer) GetRevisionSummaries(hashes []string) ([]messages.RevisionSum mary, error) { |
| 508 ret := []messages.RevisionSummary{} | 526 ret := []messages.RevisionSummary{} |
| 509 for _, h := range hashes { | 527 for _, h := range hashes { |
| 528 a.rslck.Lock() | |
| 510 s, ok := a.revisionSummaries[h] | 529 s, ok := a.revisionSummaries[h] |
| 530 a.rslck.Unlock() | |
| 511 if !ok { | 531 if !ok { |
| 512 return nil, fmt.Errorf("Unrecognized hash: %s", h) | 532 return nil, fmt.Errorf("Unrecognized hash: %s", h) |
| 513 } | 533 } |
| 514 ret = append(ret, s) | 534 ret = append(ret, s) |
| 515 } | 535 } |
| 516 | 536 |
| 517 return ret, nil | 537 return ret, nil |
| 518 } | 538 } |
| 519 | 539 |
| 520 // builderStepAlerts scans the steps of recent builds done on a particular builder, | 540 // builderStepAlerts scans the steps of recent builds done on a particular builder, |
| (...skipping 186 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 707 // goroutine/channel because the reasonsForFailure call potentially | 727 // goroutine/channel because the reasonsForFailure call potentially |
| 708 // blocks on IO. | 728 // blocks on IO. |
| 709 if failure.step.Name == "steps" { | 729 if failure.step.Name == "steps" { |
| 710 // check results to see if it's an array of [4] | 730 // check results to see if it's an array of [4] |
| 711 // That's a purple failure, which should go to infra/trooper. | 731 // That's a purple failure, which should go to infra/trooper. |
| 712 log.Infof("steps results: %+v", failure.step) | 732 log.Infof("steps results: %+v", failure.step) |
| 713 if len(failure.step.Results) > 0 { | 733 if len(failure.step.Results) > 0 { |
| 714 if r, ok := failure.step.Results[0].(float64); o k && r == resInfraFailure { | 734 if r, ok := failure.step.Results[0].(float64); o k && r == resInfraFailure { |
| 715 // TODO: Create a trooper alert about this. | 735 // TODO: Create a trooper alert about this. |
| 716 log.Errorf("INFRA FAILURE: %+v", failure ) | 736 log.Errorf("INFRA FAILURE: %+v", failure ) |
| 737 alr := messages.Alert{ | |
| 738 Title: fmt.Sprintf("%s infra failure", failure.builderName), | |
| 739 Body: fmt.Sprintf("On step % s", failure.step.Name), | |
| 740 Type: "infra-failure", | |
| 741 Severity: infraFailureSev, | |
| 742 } | |
| 743 rs <- res{ | |
| 744 f: failure, | |
| 745 a: &alr, | |
| 746 err: nil, | |
| 747 } | |
| 717 } | 748 } |
| 718 } | 749 } |
| 719 continue | 750 continue |
| 720 // The actual breaking step will appear later. | 751 // The actual breaking step will appear later. |
| 721 } | 752 } |
| 722 | 753 |
| 723 // Check the gatekeeper configs to see if this is ignorable. | 754 // Check the gatekeeper configs to see if this is ignorable. |
| 724 » » if a.excludeFailure(failure.masterName, failure.builderName, fai lure.step.Name) { | 755 » » if a.Gatekeeper.ExcludeFailure(failure.masterName, failure.build erName, failure.step.Name) { |
| 725 continue | 756 continue |
| 726 } | 757 } |
| 727 | 758 |
| 728 // Gets the named revision number from gnumbd metadata. | 759 // Gets the named revision number from gnumbd metadata. |
| 729 getCommitPos := func(b messages.Build, name string) (string, boo l) { | 760 getCommitPos := func(b messages.Build, name string) (string, boo l) { |
| 730 for _, p := range b.Properties { | 761 for _, p := range b.Properties { |
| 731 if p[0] == name { | 762 if p[0] == name { |
| 732 s, ok := p[1].(string) | 763 s, ok := p[1].(string) |
| 733 return s, ok | 764 return s, ok |
| 734 } | 765 } |
| 735 } | 766 } |
| 736 return "", false | 767 return "", false |
| 737 } | 768 } |
| 738 | 769 |
| 739 scannedFailures = append(scannedFailures, failure) | 770 scannedFailures = append(scannedFailures, failure) |
| 740 go func(f stepFailure) { | 771 go func(f stepFailure) { |
| 741 expvars.Add("StepFailures", 1) | 772 expvars.Add("StepFailures", 1) |
| 742 defer expvars.Add("StepFailures", -1) | 773 defer expvars.Add("StepFailures", -1) |
| 743 alr := messages.Alert{ | 774 alr := messages.Alert{ |
| 744 » » » » Title: fmt.Sprintf("Builder step failure: %s.%s" , f.masterName, f.builderName), | 775 » » » » Title: fmt.Sprintf("%s step failure", f.build erName), |
| 745 » » » » Time: messages.EpochTime(a.Now().Unix()), | 776 » » » » Body: fmt.Sprintf("%s failing on %s/%s", f.s tep.Name, f.masterName, f.builderName), |
| 746 » » » » Type: "buildfailure", | 777 » » » » Time: messages.EpochTime(a.Now().Unix()), |
| 778 » » » » Type: "buildfailure", | |
| 779 » » » » Severity: newFailureSev, | |
| 747 } | 780 } |
| 748 | 781 |
| 749 regRanges := []messages.RegressionRange{} | 782 regRanges := []messages.RegressionRange{} |
| 750 revisionsByRepo := map[string][]string{} | 783 revisionsByRepo := map[string][]string{} |
| 751 | 784 |
| 752 // Get gnumbd sequence numbers for whatever this build pulled in. | 785 // Get gnumbd sequence numbers for whatever this build pulled in. |
| 753 chromiumPos, ok := getCommitPos(f.build, "got_revision_c p") | 786 chromiumPos, ok := getCommitPos(f.build, "got_revision_c p") |
| 754 if ok { | 787 if ok { |
| 755 regRanges = append(regRanges, messages.Regressio nRange{ | 788 regRanges = append(regRanges, messages.Regressio nRange{ |
| 756 Repo: "chromium", | 789 Repo: "chromium", |
| (...skipping 23 matching lines...) Expand all Loading... | |
| 780 Repo: "nacl", | 813 Repo: "nacl", |
| 781 Positions: []string{naclPos}, | 814 Positions: []string{naclPos}, |
| 782 }) | 815 }) |
| 783 } | 816 } |
| 784 | 817 |
| 785 for _, change := range f.build.SourceStamp.Changes { | 818 for _, change := range f.build.SourceStamp.Changes { |
| 786 revisionsByRepo[change.Repository] = append(revi sionsByRepo[change.Repository], change.Revision) | 819 revisionsByRepo[change.Repository] = append(revi sionsByRepo[change.Repository], change.Revision) |
| 787 // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd. | 820 // change.Revision is *not* always a git hash. Sometimes it is a position from gnumbd. |
| 788 // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time. | 821 // change.Revision is git hash or gnumbd depending on what exactly? Not obvious at this time. |
| 789 // A potential problem here is when multiple repos have overlapping gnumbd ranges. | 822 // A potential problem here is when multiple repos have overlapping gnumbd ranges. |
| 823 parts := cpRE.FindAllStringSubmatch(change.Comme nts, -1) | |
| 824 pos, branch := "", "" | |
| 825 if len(parts) > 0 { | |
| 826 branch = parts[0][1] | |
| 827 pos = parts[0][2] | |
| 828 } | |
| 829 a.rslck.Lock() | |
| 790 a.revisionSummaries[change.Revision] = messages. RevisionSummary{ | 830 a.revisionSummaries[change.Revision] = messages. RevisionSummary{ |
| 791 GitHash: change.Revision, | 831 GitHash: change.Revision, |
| 792 Link: change.Revlink, | 832 Link: change.Revlink, |
| 793 Description: trunc(change.Comments), | 833 Description: trunc(change.Comments), |
| 794 Author: change.Who, | 834 Author: change.Who, |
| 795 When: change.When, | 835 When: change.When, |
| 836 Position: pos, | |
| 837 Branch: branch, | |
| 796 } | 838 } |
| 839 a.rslck.Unlock() | |
| 797 } | 840 } |
| 798 | 841 |
| 799 for repo, revisions := range revisionsByRepo { | 842 for repo, revisions := range revisionsByRepo { |
| 800 regRanges = append(regRanges, messages.Regressio nRange{ | 843 regRanges = append(regRanges, messages.Regressio nRange{ |
| 801 Repo: repo, | 844 Repo: repo, |
| 802 Revisions: revisions, | 845 Revisions: revisions, |
| 803 }) | 846 }) |
| 804 } | 847 } |
| 805 | 848 |
| 806 // If the builder has been failing on the same step for multiple builds in a row, | 849 // If the builder has been failing on the same step for multiple builds in a row, |
| 807 // we should have only one alert but indicate the range of builds affected. | 850 // we should have only one alert but indicate the range of builds affected. |
| 808 // These are set in FirstFailure and LastFailure. | 851 // These are set in FirstFailure and LastFailure. |
| 809 bf := messages.BuildFailure{ | 852 bf := messages.BuildFailure{ |
| 810 // FIXME: group builders? | 853 // FIXME: group builders? |
| 811 Builders: []messages.AlertedBuilder{ | 854 Builders: []messages.AlertedBuilder{ |
| 812 { | 855 { |
| 813 Name: f.builderName, | 856 Name: f.builderName, |
| 814 URL: client.BuilderURL (f.masterName, f.builderName), | 857 URL: client.BuilderURL (f.masterName, f.builderName), |
| 815 StartTime: f.build.CreatedTi mestamp, | 858 StartTime: f.build.CreatedTi mestamp, |
| 816 FirstFailure: f.build.Number, | 859 FirstFailure: f.build.Number, |
| 817 LatestFailure: f.build.Number, | 860 LatestFailure: f.build.Number, |
| 818 }, | 861 }, |
| 819 }, | 862 }, |
| 820 » » » » TreeCloser: a.wouldCloseTree(f.masterName, f.builderName, f.step.Name), | 863 » » » » TreeCloser: a.Gatekeeper.WouldCloseTree(f. masterName, f.builderName, f.step.Name), |
| 821 RegressionRanges: regRanges, | 864 RegressionRanges: regRanges, |
| 822 } | 865 } |
| 823 | 866 |
| 867 if bf.TreeCloser { | |
| 868 alr.Severity = treeCloserSev | |
| 869 } | |
| 870 | |
| 824 reasons := a.reasonsForFailure(f) | 871 reasons := a.reasonsForFailure(f) |
| 825 for _, r := range reasons { | 872 for _, r := range reasons { |
| 826 bf.Reasons = append(bf.Reasons, messages.Reason{ | 873 bf.Reasons = append(bf.Reasons, messages.Reason{ |
| 827 TestName: r, | 874 TestName: r, |
| 828 Step: f.step.Name, | 875 Step: f.step.Name, |
| 829 URL: f.URL(), | 876 URL: f.URL(), |
| 830 }) | 877 }) |
| 831 } | 878 } |
| 832 | 879 |
| 833 alr.Key = alertKey(f.masterName, f.builderName, f.step.N ame) | 880 alr.Key = alertKey(f.masterName, f.builderName, f.step.N ame) |
| (...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 888 | 935 |
| 889 if !recognized { | 936 if !recognized { |
| 890 // TODO: log and report frequently encountered unrecognized builder step | 937 // TODO: log and report frequently encountered unrecognized builder step |
| 891 // failure names. | 938 // failure names. |
| 892 log.Errorf("Unrecognized step step failure type, unable to find reasons: %s", f.step.Name) | 939 log.Errorf("Unrecognized step step failure type, unable to find reasons: %s", f.step.Name) |
| 893 } | 940 } |
| 894 | 941 |
| 895 return ret | 942 return ret |
| 896 } | 943 } |
| 897 | 944 |
| 898 func (a *Analyzer) excludeFailure(master, builder, step string) bool { | |
| 899 mc, ok := a.MasterCfgs[master] | |
| 900 if !ok { | |
| 901 log.Errorf("Can't filter unknown master %s", master) | |
| 902 return false | |
| 903 } | |
| 904 | |
| 905 for _, ebName := range mc.ExcludedBuilders { | |
| 906 if ebName == "*" || ebName == builder { | |
| 907 return true | |
| 908 } | |
| 909 } | |
| 910 | |
| 911 // Not clear that builder_alerts even looks at the rest of these conditions | |
| 912 // even though they're specified in gatekeeper.json | |
| 913 for _, s := range mc.ExcludedSteps { | |
| 914 if step == s { | |
| 915 return true | |
| 916 } | |
| 917 } | |
| 918 | |
| 919 bc, ok := mc.Builders[builder] | |
| 920 if !ok { | |
| 921 if bc, ok = mc.Builders["*"]; !ok { | |
| 922 log.Warningf("Unknown %s builder %s", master, builder) | |
| 923 return true | |
| 924 } | |
| 925 } | |
| 926 | |
| 927 for _, esName := range bc.ExcludedSteps { | |
| 928 if esName == step || esName == "*" { | |
| 929 return true | |
| 930 } | |
| 931 } | |
| 932 | |
| 933 return false | |
| 934 } | |
| 935 | |
| 936 func (a *Analyzer) wouldCloseTree(master, builder, step string) bool { | |
| 937 mc, ok := a.MasterCfgs[master] | |
| 938 if !ok { | |
| 939 log.Errorf("Missing master cfg: %s", master) | |
| 940 return false | |
| 941 } | |
| 942 bc, ok := mc.Builders[builder] | |
| 943 if !ok { | |
| 944 bc, ok = mc.Builders["*"] | |
| 945 if ok { | |
| 946 return true | |
| 947 } | |
| 948 } | |
| 949 | |
| 950 for _, xstep := range bc.ExcludedSteps { | |
| 951 if xstep == step { | |
| 952 return false | |
| 953 } | |
| 954 } | |
| 955 | |
| 956 csteps := []string{} | |
| 957 csteps = append(csteps, bc.ClosingSteps...) | |
| 958 csteps = append(csteps, bc.ClosingOptional...) | |
| 959 | |
| 960 for _, cs := range csteps { | |
| 961 if cs == "*" || cs == step { | |
| 962 return true | |
| 963 } | |
| 964 } | |
| 965 | |
| 966 return false | |
| 967 } | |
| 968 | |
| 969 // unexpected returns the set of expected xor actual. | 945 // unexpected returns the set of expected xor actual. |
| 970 func unexpected(expected, actual []string) []string { | 946 func unexpected(expected, actual []string) []string { |
| 971 e, a := make(map[string]bool), make(map[string]bool) | 947 e, a := make(map[string]bool), make(map[string]bool) |
| 972 for _, s := range expected { | 948 for _, s := range expected { |
| 973 e[s] = true | 949 e[s] = true |
| 974 } | 950 } |
| 975 for _, s := range actual { | 951 for _, s := range actual { |
| 976 a[s] = true | 952 a[s] = true |
| 977 } | 953 } |
| 978 | 954 |
| (...skipping 17 matching lines...) Expand all Loading... | |
| 996 masterName string | 972 masterName string |
| 997 builderName string | 973 builderName string |
| 998 build messages.Build | 974 build messages.Build |
| 999 step messages.Step | 975 step messages.Step |
| 1000 } | 976 } |
| 1001 | 977 |
| 1002 // URL returns a url to builder step failure page. | 978 // URL returns a url to builder step failure page. |
| 1003 func (f stepFailure) URL() string { | 979 func (f stepFailure) URL() string { |
| 1004 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build. Number) | 980 return client.StepURL(f.masterName, f.builderName, f.step.Name, f.build. Number) |
| 1005 } | 981 } |
| OLD | NEW |