go/src/infra/monitoring/analyzer/analyzer.go - Issue 1125263004: dispatcher: fix test result parsing, build ranges for failure alerts, other fixes

Side by Side Diff: go/src/infra/monitoring/analyzer/analyzer.go

Issue 1125263004: dispatcher: fix test result parsing, build ranges for failure alerts, other fixes (Closed) Base URL: https://chromium.googlesource.com/infra/infra.git@master

Patch Set: fixed comments Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package analyzer	5 package analyzer

6	6

7 import (	7 import (

8 "errors"	8 "errors"

9 "fmt"	9 "fmt"

10 "net/url"	10 "net/url"

11 "path/filepath"	11 "path/filepath"

12 "sort"	12 "sort"

13 "strings"	13 "strings"

	14 "sync"

14 "time"	15 "time"

15	16

16 "github.com/Sirupsen/logrus"	17 "github.com/Sirupsen/logrus"

17	18

18 "infra/monitoring/client"	19 "infra/monitoring/client"

19 "infra/monitoring/messages"	20 "infra/monitoring/messages"

20 )	21 )

21	22

22 const (	23 const (

23 // StepCompletedRun is a synthetic step name used to indicate the build run is complete.	24 // StepCompletedRun is a synthetic step name used to indicate the build run is complete.

24 » StepCompletedRun = "completed run"	25 » StepCompletedRun = "completed run"

	26 » treeCloserPri = 0

	27 » staleMasterSev = 0

	28 » staleBuilderSev = 0

	29 » hungBuilderSev = 1

	30 » idleBuilderSev = 1

	31 » offlineBuilderSev = 1

25 )	32 )

26	33

27 var (	34 var (

28 log = logrus.New()	35 log = logrus.New()

29 )	36 )

30	37

31 var (	38 var (

32 errNoBuildSteps = errors.New("No build steps")	39 errNoBuildSteps = errors.New("No build steps")

33 )	40 )

34	41

(...skipping 14 matching lines...) Expand all Loading...
49 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer.	56 // Reasons lists the reasons for the stepFailure determined by the StepF ailureAnalyzer.

50 Reasons []string	57 Reasons []string

51 }	58 }

52	59

53 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on,	60 // MasterAnalyzer runs the process of checking masters, builders, test results a nd so on,

54 // in order to produce alerts.	61 // in order to produce alerts.

55 type MasterAnalyzer struct {	62 type MasterAnalyzer struct {

56 // MaxRecentBuilds is the maximum number of recent builds to check, per builder.	63 // MaxRecentBuilds is the maximum number of recent builds to check, per builder.

57 MaxRecentBuilds int	64 MaxRecentBuilds int

58	65

	66 // MinRecentBuilds is the minimum number of recent builds to check, per builder.

	67 MinRecentBuilds int

	68

59 // StepAnalzers are the set of build step failure analyzers to be checke d on	69 // StepAnalzers are the set of build step failure analyzers to be checke d on

60 // build step failures.	70 // build step failures.

61 StepAnalyzers []StepAnalyzer	71 StepAnalyzers []StepAnalyzer

62	72

63 // Client is the Client implementation for fetching json from CBE, build s, etc.	73 // Client is the Client implementation for fetching json from CBE, build s, etc.

64 Client client.Client	74 Client client.Client

65	75

66 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building"	76 // HungBuilerThresh is the maxumum length of time a builder may be in st ate "building"

67 // before triggering a "hung builder" alert.	77 // before triggering a "hung builder" alert.

68 HungBuilderThresh time.Duration	78 HungBuilderThresh time.Duration

69	79

70 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline"	80 // OfflineBuilderThresh is the maximum length of time a builder may be i n state "offline"

71 // before triggering an "offline builder" alert.	81 // before triggering an "offline builder" alert.

72 OfflineBuilderThresh time.Duration	82 OfflineBuilderThresh time.Duration

73	83

74 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue	84 // IdleBuilderCountThresh is the maximum number of builds a builder may have in queue

75 // while in the "idle" state before triggering an "idle builder" alert.	85 // while in the "idle" state before triggering an "idle builder" alert.

76 IdleBuilderCountThresh int64	86 IdleBuilderCountThresh int64

77	87

78 // StaleMasterThreshold is the maximum age that master data from CBE can be before	88 // StaleMasterThreshold is the maximum age that master data from CBE can be before

79 // triggering a "stale master" alert.	89 // triggering a "stale master" alert.

80 StaleMasterThreshold time.Duration	90 StaleMasterThreshold time.Duration

81	91

82 // bCache is a map of build cache key to Build message.	92 // bCache is a map of build cache key to Build message.

83 bCache map[string]*messages.Builds	93 bCache map[string]*messages.Builds

	94 // bLock protects bCache

	95 bLock *sync.Mutex

	96

	97 // These limit the scope analysis, useful for debugging.

	98 TreeOnly string

	99 MasterOnly string

	100 BuilderOnly string

	101 BuildOnly int64

84	102

85 // now is useful for mocking the system clock in testing.	103 // now is useful for mocking the system clock in testing.

86 now func() time.Time	104 now func() time.Time

87 }	105 }

88	106

89 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion.	107 // New returns a new Analyzer. If client is nil, it assigns a default implementa tion.

90 // maxBuilds is the maximum number of builds to check, per builder.	108 // maxBuilds is the maximum number of builds to check, per builder.

91 func New(c client.Client, maxBuilds int) *MasterAnalyzer {	109 func New(c client.Client, minBuilds, maxBuilds int) *MasterAnalyzer {

92 if c == nil {	110 if c == nil {

93 c = client.New()	111 c = client.New()

94 }	112 }

95	113

96 return &MasterAnalyzer{	114 return &MasterAnalyzer{

97 Client: c,	115 Client: c,

98 MaxRecentBuilds: maxBuilds,	116 MaxRecentBuilds: maxBuilds,

	117 MinRecentBuilds: minBuilds,

99 HungBuilderThresh: 3 * time.Hour,	118 HungBuilderThresh: 3 * time.Hour,

100 OfflineBuilderThresh: 90 * time.Minute,	119 OfflineBuilderThresh: 90 * time.Minute,

101 IdleBuilderCountThresh: 50,	120 IdleBuilderCountThresh: 50,

102 StaleMasterThreshold: 10 * time.Minute,	121 StaleMasterThreshold: 10 * time.Minute,

103 StepAnalyzers: []StepAnalyzer{	122 StepAnalyzers: []StepAnalyzer{

104 &TestFailureAnalyzer{Client: c},	123 &TestFailureAnalyzer{Client: c},

	124 &CompileFailureAnalyzer{Client: c},

105 },	125 },

106	126

107 now: func() time.Time {	127 now: func() time.Time {

108 return time.Now()	128 return time.Now()

109 },	129 },

110	130

111 bCache: map[string]*messages.Builds{},	131 bCache: map[string]*messages.Builds{},

	132 bLock: &sync.Mutex{},

112 }	133 }

113 }	134 }

114	135

115 // MasterAlerts returns alerts generated from the master at URL.	136 // MasterAlerts returns alerts generated from the master at URL.

116 func (a MasterAnalyzer) MasterAlerts(url string, be messages.BuildExtract) []m essages.Alert {	137 func (a MasterAnalyzer) MasterAlerts(url string, be messages.BuildExtract) []m essages.Alert {

117 ret := []messages.Alert{}	138 ret := []messages.Alert{}

118	139

119 // Copied logic from builder_messages.	140 // Copied logic from builder_messages.

120 // No created_timestamp should be a warning sign, no?	141 // No created_timestamp should be a warning sign, no?

121 if be.CreatedTimestamp == messages.EpochTime(0) {	142 if be.CreatedTimestamp == messages.EpochTime(0) {

122 return ret	143 return ret

123 }	144 }

124	145

125 elapsed := a.now().Sub(be.CreatedTimestamp.Time())	146 elapsed := a.now().Sub(be.CreatedTimestamp.Time())

126 if elapsed > a.StaleMasterThreshold {	147 if elapsed > a.StaleMasterThreshold {

127 ret = append(ret, messages.Alert{	148 ret = append(ret, messages.Alert{

128 Key: fmt.Sprintf("stale master: %v", url),	149 Key: fmt.Sprintf("stale master: %v", url),

129 Title: "Stale Master Data",	150 Title: "Stale Master Data",

130 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()),	151 Body: fmt.Sprintf("%s elapsed since last update (%s) .", elapsed, be.CreatedTimestamp.Time()),

131 » » » Severity: 0,	152 » » » Severity: staleMasterSev,

132 Time: messages.TimeToEpochTime(a.now()),	153 Time: messages.TimeToEpochTime(a.now()),

133 Links: []messages.Link{{"Master", url}},	154 Links: []messages.Link{{"Master", url}},

134 // No type or extension for now.	155 // No type or extension for now.

135 })	156 })

136 }	157 }

137 if elapsed < 0 {	158 if elapsed < 0 {

138 // Add this to the alerts returned, rather than just log it?	159 // Add this to the alerts returned, rather than just log it?

139 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed)	160 log.Errorf("Master %s timestamp is newer than current time (%s): %s old.", url, a.now(), elapsed)

140 }	161 }

141	162

(...skipping 11 matching lines...) Expand all Loading...
153 type r struct {	174 type r struct {

154 bn string	175 bn string

155 b messages.Builders	176 b messages.Builders

156 alerts []messages.Alert	177 alerts []messages.Alert

157 err []error	178 err []error

158 }	179 }

159 c := make(chan r, len(be.Builders))	180 c := make(chan r, len(be.Builders))

160	181

161 // TODO: get a list of all the running builds from be.Slaves? It	182 // TODO: get a list of all the running builds from be.Slaves? It

162 // appears to be used later on in the original py.	183 // appears to be used later on in the original py.

	184 scannedBuilders := []string{}

163 for bn, b := range be.Builders {	185 for bn, b := range be.Builders {

	186 if a.BuilderOnly != "" && bn != a.BuilderOnly {

	187 continue

	188 }

	189 scannedBuilders = append(scannedBuilders, bn)

164 go func(bn string, b messages.Builders) {	190 go func(bn string, b messages.Builders) {

165 out := r{bn: bn, b: b}	191 out := r{bn: bn, b: b}

166 defer func() {	192 defer func() {

167 c <- out	193 c <- out

168 }()	194 }()

169	195

170 // This blocks on IO, hence the goroutine.	196 // This blocks on IO, hence the goroutine.

171 a.warmBuildCache(mn, bn, b.CachedBuilds)	197 a.warmBuildCache(mn, bn, b.CachedBuilds)

172

173 // Each call to builderAlerts may trigger blocking json fetches,	198 // Each call to builderAlerts may trigger blocking json fetches,

174 // but it has a data dependency on the above cache-warmi ng call, so	199 // but it has a data dependency on the above cache-warmi ng call, so

175 // the logic remains serial.	200 // the logic remains serial.

176 out.alerts, out.err = a.builderAlerts(mn, bn, &b)	201 out.alerts, out.err = a.builderAlerts(mn, bn, &b)

177 }(bn, b)	202 }(bn, b)

178 }	203 }

179	204

180 ret := []messages.Alert{}	205 ret := []messages.Alert{}

181 » for bn := range be.Builders {	206 » for _, bn := range scannedBuilders {

182 r := <-c	207 r := <-c

183 if len(r.err) != 0 {	208 if len(r.err) != 0 {

184 // TODO: add a special alert for this too?	209 // TODO: add a special alert for this too?

185 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err)	210 log.Errorf("Error getting alerts for builder %s: %v", bn , r.err)

186 } else {	211 } else {

187 ret = append(ret, r.alerts...)	212 ret = append(ret, r.alerts...)

188 }	213 }

189 }	214 }

190	215

191 return ret	216 return ret

192 }	217 }

193	218

194 // masterName extracts the name of the master from the master's URL.	219 // masterName extracts the name of the master from the master's URL.

195 func masterName(URL string) (string, error) {	220 func masterName(URL string) (string, error) {

196 mURL, err := url.Parse(URL)	221 mURL, err := url.Parse(URL)

197 if err != nil {	222 if err != nil {

198 return "", err	223 return "", err

199 }	224 }

200 pathParts := strings.Split(mURL.Path, "/")	225 pathParts := strings.Split(mURL.Path, "/")

201 » return pathParts[len(pathParts)-1], nil	226 » return pathParts[len(pathParts)-2], nil
	Vadim Sh. (2015/05/06 21:40:18): Check that len(pathParts) is >= 2. Also, this would be a good place to put a comment about the expected structure of the URL (I don't know why -2 exactly...).
	seanmccullough (2015/05/06 22:16:20, replying to Vadim Sh.): To lop the /json off the http://.../{master.name}/json suffix. Added check and comment.
202 }	227 }

203	228

204 func cacheKeyForBuild(master, builder string, number int64) string {	229 func cacheKeyForBuild(master, builder string, number int64) string {

205 return filepath.FromSlash(	230 return filepath.FromSlash(

206 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number))	231 fmt.Sprintf("%s/%s/%d.json", url.QueryEscape(master), url.QueryE scape(builder), number))

207 }	232 }

208	233

209 // TODO: actually write the on-disk cache.	234 // TODO: actually write the on-disk cache.

210 func filenameForCacheKey(cc string) string {	235 func filenameForCacheKey(cc string) string {

211 cc = strings.Replace(cc, "/", "_", -1)	236 cc = strings.Replace(cc, "/", "_", -1)

(...skipping 10 matching lines...) Expand all Loading...
222 v.Add("builder", builder)	247 v.Add("builder", builder)

223	248

224 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode())	249 URL := fmt.Sprintf("https://chrome-build-extract.appspot.com/get_builds? %s", v.Encode())

225 res := struct {	250 res := struct {

226 Builds []messages.Builds `json:"builds"`	251 Builds []messages.Builds `json:"builds"`

227 }{}	252 }{}

228	253

229 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and	254 // TODO: add FetchBuilds to the client interface. Take a list of {master , builder} and

230 // return (map[{master, builder}][]Builds, map [{master, builder}]error)	255 // return (map[{master, builder}][]Builds, map [{master, builder}]error)

231 // That way we can do all of these in parallel.	256 // That way we can do all of these in parallel.

	257

232 status, err := a.Client.JSON(URL, &res)	258 status, err := a.Client.JSON(URL, &res)

233 if err != nil {	259 if err != nil {

234 log.Errorf("Error (%d) fetching %s: %s", status, URL, err)	260 log.Errorf("Error (%d) fetching %s: %s", status, URL, err)

235 }	261 }

236	262

	263 a.bLock.Lock()

237 for _, b := range res.Builds {	264 for _, b := range res.Builds {

238 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &b	265 » » // TODO: consider making res.Builds be []*messages.Builds instea d of []messages.Builds

	266 » » ba := b
	Vadim Sh. (2015/05/06 21:40:18): IIUC, b is already a copy of whatever is in messages.Builds, so it should be safe to just use &b below (not 100% sure, worth conducting an experiment...).
	seanmccullough (2015/05/06 22:16:20, replying to Vadim Sh.): Taking the address of a range var always gives you the same address. I found this out the hard way :) https://play.golang.org/p/s7QAhTHx_2 [Editor's note: this holds for the Go versions current at the time of this review; since Go 1.22 each loop iteration gets its own variable.]
	Vadim Sh. (2015/05/06 22:30:23, replying to seanmccullough): :) Good to know. Go pointers + garbage collection still look somewhat magical to me... E.g. in C++ I'd expect "ba" to be stack allocated and &ba to have the same address in each loop iteration, sort of like "b" behaves.
	seanmccullough (2015/05/06 22:43:13, replying to Vadim Sh.): Thankfully there aren't quite as many wtf moments with Go pointers, but they do happen.
	267 » » a.bCache[cacheKeyForBuild(master, builder, b.Number)] = &ba

239 }	268 }

	269 a.bLock.Unlock()

240 }	270 }

241	271

242 // This type is used for sorting build IDs.	272 // This type is used for sorting build IDs.

243 type buildIDs []int64	273 type buildIDs []int64

244	274

245 func (a buildIDs) Len() int { return len(a) }	275 func (a buildIDs) Len() int { return len(a) }

246 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] }	276 func (a buildIDs) Swap(i, j int) { a[i], a[j] = a[j], a[i] }

247 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] }	277 func (a buildIDs) Less(i, j int) bool { return a[i] > a[j] }

248	278

249 // latestBuildStep returns the latest build step name and update time, and an er ror	279 // latestBuildStep returns the latest build step name and update time, and an er ror

(...skipping 33 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
283 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]	313 recentBuildIDs = recentBuildIDs[:a.MaxRecentBuilds]

284 }	314 }

285 if len(recentBuildIDs) == 0 {	315 if len(recentBuildIDs) == 0 {

286 // TODO: Make an alert for this?	316 // TODO: Make an alert for this?

287 log.Errorf("No recent builds for %s.%s", mn, bn)	317 log.Errorf("No recent builds for %s.%s", mn, bn)

288 return alerts, errs	318 return alerts, errs

289 }	319 }

290 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn)	320 log.Infof("Checking %d most recent builds for alertable step failures: % s/%s", len(recentBuildIDs), mn, bn)

291	321

292 // Check for alertable step failures.	322 // Check for alertable step failures.

293 » for _, buildID := range recentBuildIDs {	323 » stepFailureAlerts := map[string]messages.Alert{}

	324 » firstFailureInstance := map[string]int64{}

	325

	326 » for i, buildID := range recentBuildIDs {

294 failures, err := a.stepFailures(mn, bn, buildID)	327 failures, err := a.stepFailures(mn, bn, buildID)

295 if err != nil {	328 if err != nil {

296 errs = append(errs, err)	329 errs = append(errs, err)

297 }	330 }

	331 if len(failures) == 0 && i > a.MinRecentBuilds {

	332 // Bail as soon as we find a successful build prior to t he most recent couple of builds.

	333 break

	334 }

	335

298 as, err := a.stepFailureAlerts(failures)	336 as, err := a.stepFailureAlerts(failures)

299 if err != nil {	337 if err != nil {

300 errs = append(errs, err)	338 errs = append(errs, err)

301 }	339 }

302 » » alerts = append(alerts, as...)	340

	341 » » lastKey := ""

	342 » » // This probably isn't exactly what we want since multiple build ers may have the same alert firing.

	343 » » // This just merges build ranges on a per-builder basis.

	344 » » for _, alr := range as {

	345 » » » // Only keep the most recent alert. Since recentBuildID s is

	346 » » » // sorted newest build first, ignore older instances of the alert.

	347 » » » if _, ok := stepFailureAlerts[alr.Key]; !ok {

	348 » » » » stepFailureAlerts[alr.Key] = alr

	349 » » » }

	350 » » » if alr.Key == lastKey \|\| lastKey == "" {

	351 » » » » firstFailureInstance[alr.Key] = alr.Extension.(m essages.BuildFailure).Builders[0].FirstFailure
	Vadim Sh. (2015/05/06 21:40:18): This type assertion looks rather magical... Perhaps it would be better to be prepared to handle other types of extensions? E.g.: if ext, ok := alr.Extension.(messages.BuildFailure); ok { ... = ext.Builders[0].FirstFailure } Or add a comment to stepFailureAlerts noting that it MUST return alerts with a BuildFailure extension only.
	seanmccullough (2015/05/06 22:16:20, replying to Vadim Sh.): Added a check and log an error if it fails. Not sure yet if this should handle non-BuildFailure extensions.
	352 » » » }

	353 » » » lastKey = alr.Key

	354 » » }

303 }	355 }

304	356

305 » // Check for stale builders. Latest build is the first in the list.	357 » for _, alr := range stepFailureAlerts {

	358 » » alr.Extension.(messages.BuildFailure).Builders[0].FirstFailure = firstFailureInstance[alr.Key]

	359 » » alerts = append(alerts, alr)

	360 » }

	361

	362 » // Check for stale/idle/offline builders. Latest build is the first in the list.

306 lastBuildID := recentBuildIDs[0]	363 lastBuildID := recentBuildIDs[0]

307 log.Infof("Checking last build ID: %d", lastBuildID)	364 log.Infof("Checking last build ID: %d", lastBuildID)

	365

308 // TODO: get this from cache.	366 // TODO: get this from cache.

309 » lastBuild, err := a.Client.Build(mn, bn, lastBuildID)	367 » var lastBuild *messages.Builds

310 » if err != nil {	368 » a.bLock.Lock()

311 » » errs = append(errs, fmt.Errorf("Couldn't get latest build %d for %s.%s: %s", lastBuildID, mn, bn, err))	369 » lastBuild = a.bCache[cacheKeyForBuild(mn, bn, lastBuildID)]
	Vadim Sh. (2015/05/06 21:40:18): Use := here and remove the var declaration above.
	seanmccullough (2015/05/06 22:16:20, replying to Vadim Sh.): Done.
312 » » return alerts, errs	370 » a.bLock.Unlock()

	371 » if lastBuild == nil {

	372 » » var err error

	373 » » lastBuild, err = a.Client.Build(mn, bn, lastBuildID)

	374 » » if err != nil {

	375 » » » errs = append(errs, fmt.Errorf("Couldn't get latest buil d %d for %s.%s: %s", lastBuildID, mn, bn, err))

	376 » » » return alerts, errs

	377 » » }

313 }	378 }

314	379

315 // Examining only the latest build is probably suboptimal since if it's still in progress it might	380 // Examining only the latest build is probably suboptimal since if it's still in progress it might

316 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds.	381 // not have hit a step that is going to fail and has failed repeatedly f or the last few builds.

317 // AKA "Reliable failures". TODO: Identify "Reliable failures"	382 // AKA "Reliable failures". TODO: Identify "Reliable failures"

318 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)	383 lastStep, lastUpdated, err := a.latestBuildStep(lastBuild)

319 if err != nil {	384 if err != nil {

320 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err))	385 errs = append(errs, fmt.Errorf("Couldn't get latest build step f or %s.%s: %v", mn, bn, err))

321 return alerts, errs	386 return alerts, errs

322	387

323 }	388 }

324 elapsed := a.now().Sub(lastUpdated.Time())	389 elapsed := a.now().Sub(lastUpdated.Time())

325 links := []messages.Link{	390 links := []messages.Link{

326 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)},	391 {"Builder", fmt.Sprintf("https://build.chromium.org/p/%s/builder s/%s", mn, bn)},

327 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)},	392 {"Last build", fmt.Sprintf("https://build.chromium.org/p/%s/buil ders/%s/builds/%d", mn, bn, lastBuildID)},

328 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)},	393 {"Last build step", fmt.Sprintf("https://build.chromium.org/p/%s /builders/%s/builds/%d/steps/%s", mn, bn, lastBuildID, lastStep)},

329 }	394 }

330	395

331 switch b.State {	396 switch b.State {

332 case messages.StateBuilding:	397 case messages.StateBuilding:

333 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {	398 if elapsed > a.HungBuilderThresh && lastStep != StepCompletedRun {

334 alerts = append(alerts, messages.Alert{	399 alerts = append(alerts, messages.Alert{

335 Key: fmt.Sprintf("%s.%s.hung", mn, bn),	400 Key: fmt.Sprintf("%s.%s.hung", mn, bn),

336 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep),	401 Title: fmt.Sprintf("%s.%s is hung in step %s. ", mn, bn, lastStep),

337 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh),	402 Body: fmt.Sprintf("%s.%s has been building f or %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed , lastUpdated.Time(), a.HungBuilderThresh),

338 » » » » Severity: 0,	403 » » » » Severity: hungBuilderSev,

339 Time: messages.TimeToEpochTime(a.now()),	404 Time: messages.TimeToEpochTime(a.now()),

340 Links: links,	405 Links: links,

341 })	406 })

342 // Note, just because it's building doesn't mean it's in a good state. If the last N builds	407 // Note, just because it's building doesn't mean it's in a good state. If the last N builds

343 // all failed (for some large N) then this might still b e alertable.	408 // all failed (for some large N) then this might still b e alertable.

344 }	409 }

345 case messages.StateOffline:	410 case messages.StateOffline:

346 if elapsed > a.OfflineBuilderThresh {	411 if elapsed > a.OfflineBuilderThresh {

347 alerts = append(alerts, messages.Alert{	412 alerts = append(alerts, messages.Alert{

348 Key: fmt.Sprintf("%s.%s.offline", mn, bn),	413 Key: fmt.Sprintf("%s.%s.offline", mn, bn),

349 Title: fmt.Sprintf("%s.%s is offline.", mn, b n),	414 Title: fmt.Sprintf("%s.%s is offline.", mn, b n),

350 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s), past the alerting threshold of %v", mn, bn, elapsed, lastUpdated.Time(), a.OfflineBuilderThresh),	415 » » » » Body: fmt.Sprintf("%s.%s has been offline fo r %v (last step update %s %v), past the alerting threshold of %v", mn, bn, elaps ed, lastUpdated.Time(), float64(lastUpdated), a.OfflineBuilderThresh),

351 » » » » Severity: 0,	416 » » » » Severity: offlineBuilderSev,

352 Time: messages.TimeToEpochTime(a.now()),	417 Time: messages.TimeToEpochTime(a.now()),

353 Links: links,	418 Links: links,

354 })	419 })

355 }	420 }

356 case messages.StateIdle:	421 case messages.StateIdle:

357 if b.PendingBuilds > a.IdleBuilderCountThresh {	422 if b.PendingBuilds > a.IdleBuilderCountThresh {

358 alerts = append(alerts, messages.Alert{	423 alerts = append(alerts, messages.Alert{

359 Key: fmt.Sprintf("%s.%s.idle", mn, bn),	424 Key: fmt.Sprintf("%s.%s.idle", mn, bn),

360 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn),	425 Title: fmt.Sprintf("%s.%s is idle with too ma ny pending builds.", mn, bn),

361 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh),	426 Body: fmt.Sprintf("%s.%s is idle with %d pen ding builds, past the alerting threshold of %d", mn, bn, b.PendingBuilds, a.Idle BuilderCountThresh),

362 » » » » Severity: 0,	427 » » » » Severity: idleBuilderSev,

363 Time: messages.TimeToEpochTime(a.now()),	428 Time: messages.TimeToEpochTime(a.now()),

364 Links: links,	429 Links: links,

365 })	430 })

366 }	431 }

367 default:	432 default:

368 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State)	433 log.Errorf("Unknown %s.%s builder state: %s", mn, bn, b.State)

369 }	434 }

370	435

371 return alerts, errs	436 return alerts, errs

372 }	437 }

373	438

374 // stepFailures returns the steps that have failed recently on builder bn.	439 // stepFailures returns the steps that have failed recently on builder bn.

375 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) {	440 func (a *MasterAnalyzer) stepFailures(mn string, bn string, bID int64) ([]stepFa ilure, error) {

376 cc := cacheKeyForBuild(mn, bn, bID)	441 cc := cacheKeyForBuild(mn, bn, bID)

377	442

378 var err error // To avoid re-scoping b in the nested conditional below w ith a :=.	443 var err error // To avoid re-scoping b in the nested conditional below w ith a :=.

	444 a.bLock.Lock()

379 b, ok := a.bCache[cc]	445 b, ok := a.bCache[cc]

	446 a.bLock.Unlock()

380 if !ok {	447 if !ok {

381 log.Infof("Cache miss for %s", cc)	448 log.Infof("Cache miss for %s", cc)

382 b, err = a.Client.Build(mn, bn, bID)	449 b, err = a.Client.Build(mn, bn, bID)

383 if err != nil \|\| b == nil {	450 if err != nil \|\| b == nil {

384 log.Errorf("Error fetching build: %v", err)	451 log.Errorf("Error fetching build: %v", err)

385 return nil, err	452 return nil, err

386 }	453 }

387 }	454 }

388	455

389 ret := []stepFailure{}	456 ret := []stepFailure{}

	457

390 for _, s := range b.Steps {	458 for _, s := range b.Steps {

391 if !s.IsFinished \|\| len(s.Results) == 0 {	459 if !s.IsFinished \|\| len(s.Results) == 0 {

392 continue	460 continue

393 }	461 }

394 // Because Results in the json data is a homogeneous array, the unmarshaler	462 // Because Results in the json data is a homogeneous array, the unmarshaler

395 // doesn't have any type information to assert about it. We have to do	463 // doesn't have any type information to assert about it. We have to do

396 // some ugly runtime type assertion ourselves.	464 // some ugly runtime type assertion ourselves.

397 if r, ok := s.Results[0].(float64); ok {	465 if r, ok := s.Results[0].(float64); ok {

398 if r == 0 \|\| r == 1 {	466 if r == 0 \|\| r == 1 {

399 // This 0/1 check seems to be a convention or he uristic. A 0 or 1	467 // This 0/1 check seems to be a convention or he uristic. A 0 or 1

(...skipping 22 matching lines...) Expand all Loading...
422 ret := []messages.Alert{}	490 ret := []messages.Alert{}

423 type res struct {	491 type res struct {

424 f stepFailure	492 f stepFailure

425 a *messages.Alert	493 a *messages.Alert

426 err error	494 err error

427 }	495 }

428	496

429 // Might not need full capacity buffer, since some failures are ignored below.	497 // Might not need full capacity buffer, since some failures are ignored below.

430 rs := make(chan res, len(failures))	498 rs := make(chan res, len(failures))

431	499

	500 scannedFailures := []stepFailure{}

432 for _, f := range failures {	501 for _, f := range failures {

433 // goroutine/channel because the reasonsForFailure call potentia lly	502 // goroutine/channel because the reasonsForFailure call potentia lly

434 // blocks on IO.	503 // blocks on IO.

	504 if f.step.Name == "steps" {

	505 continue

	506 // The actual breaking step will appear later.

	507 }

	508 scannedFailures = append(scannedFailures, f)

435 go func(f stepFailure) {	509 go func(f stepFailure) {

436 alr := messages.Alert{	510 alr := messages.Alert{

437 Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),	511 Title: fmt.Sprintf("Builder step failure: %s.%s", f.masterName, f.builderName),

438 Time: messages.EpochTime(a.now().Unix()),	512 Time: messages.EpochTime(a.now().Unix()),

439 Type: "buildfailure",	513 Type: "buildfailure",

440 }	514 }

441	515

	516 // If the builder has been failing on the same step for multiple builds in a row,

	517 // we should have only one alert but indicate the range of builds affected.

	518 // These are set in FirstFailure and LastFailure.

442 bf := messages.BuildFailure{	519 bf := messages.BuildFailure{

443 // FIXME: group builders?	520 // FIXME: group builders?

444 Builders: []messages.AlertedBuilder{	521 Builders: []messages.AlertedBuilder{

445 {	522 {

446 Name: f.builderName,	523 Name: f.builderName,

447 » » » » » » URL: f.URL(),	524 » » » » » » URL: fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s", f.masterName, f.builderName),

448 » » » » » » FirstFailure: 0,	525 » » » » » » FirstFailure: f.build.Number,

449 » » » » » » LatestFailure: 1,	526 » » » » » » LatestFailure: f.build.Number,

450 },	527 },

451 },	528 },

452 // TODO: RegressionRanges:	529 // TODO: RegressionRanges:

453 // look into Builds.SourceStamp.Changes.	530 // look into Builds.SourceStamp.Changes.

454 }	531 }

455	532

456 reasons := a.reasonsForFailure(f)	533 reasons := a.reasonsForFailure(f)

457 for _, r := range reasons {	534 for _, r := range reasons {

458 bf.Reasons = append(bf.Reasons, messages.Reason{	535 bf.Reasons = append(bf.Reasons, messages.Reason{

459 TestName: r,	536 TestName: r,

460 Step: f.step.Name,	537 Step: f.step.Name,

	538 URL: f.URL(),

461 })	539 })

462 }	540 }

463	541

464 alr.Extension = bf	542 alr.Extension = bf

465 if len(bf.Reasons) == 0 {	543 if len(bf.Reasons) == 0 {

466 » » » » log.Warnf("No reasons for step failure: %s", alertKey(f.masterName, f.builderName, f.step.Name, ""))	544 » » » » alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, "")

467 » » » » rs <- res{	545 » » » » log.Warnf("No reasons for step failure: %s", alr.Key)

468 » » » » » f: f,

469 » » » » }

470 } else {	546 } else {

471 // Should the key include all of the reasons?	547 // Should the key include all of the reasons?

472 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0])	548 alr.Key = alertKey(f.masterName, f.builderName, f.step.Name, reasons[0])

473	549 » » » }

474 » » » » rs <- res{	550 » » » rs <- res{

475 » » » » » f: f,	551 » » » » f: f,

476 » » » » » a: &alr,	552 » » » » a: &alr,

477 » » » » » err: nil,	553 » » » » err: nil,

478 » » » » }

479 }	554 }

480 }(f)	555 }(f)

481 }	556 }

482	557

483 » for _ = range failures {	558 » for _ = range scannedFailures {

484 r := <-rs	559 r := <-rs

485 if r.a != nil {	560 if r.a != nil {

486 ret = append(ret, *r.a)	561 ret = append(ret, *r.a)

487 }	562 }

488 }	563 }

489	564

490 return ret, nil	565 return ret, nil

491 }	566 }

492	567

493 // reasonsForFailure examines the step failure and applies some heuristics to	568 // reasonsForFailure examines the step failure and applies some heuristics to

494 // find the cause. It may make blocking IO calls in the process.	569 // find the cause. It may make blocking IO calls in the process.

495 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string {	570 func (a *MasterAnalyzer) reasonsForFailure(f stepFailure) []string {

496 ret := []string{}	571 ret := []string{}

497 recognized := false	572 recognized := false

498 log.Infof("Checking for reasons for failure step: %v", f.step.Name)	573 log.Infof("Checking for reasons for failure step: %v", f.step.Name)

499 for _, sfa := range a.StepAnalyzers {	574 for _, sfa := range a.StepAnalyzers {

500 res, err := sfa.Analyze(f)	575 res, err := sfa.Analyze(f)

501 if err != nil {	576 if err != nil {

	577 // TODO: return something that contains errors and reasons.

502 log.Errorf("Error get reasons from StepAnalyzer: %v", err)	578 log.Errorf("Error get reasons from StepAnalyzer: %v", err)

	579 continue

503 }	580 }

504 if res.Recognized {	581 if res.Recognized {

505 recognized = true	582 recognized = true

506 ret = append(ret, res.Reasons...)	583 ret = append(ret, res.Reasons...)

507 }	584 }

508 }	585 }

509	586

510 if !recognized {	587 if !recognized {

511 // TODO: log and report frequently encountered unrecognized builder step	588 // TODO: log and report frequently encountered unrecognized builder step

512 // failure names.	589 // failure names.

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
552 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths.	629 // Sigh. build.chromium.org doesn't accept + as an escaped space in URL paths.

553 func oldEscape(s string) string {	630 func oldEscape(s string) string {

554 return strings.Replace(url.QueryEscape(s), "+", "%20", -1)	631 return strings.Replace(url.QueryEscape(s), "+", "%20", -1)

555 }	632 }

556	633

557 // URL returns a url to builder step failure page.	634 // URL returns a url to builder step failure page.

558 func (f stepFailure) URL() string {	635 func (f stepFailure) URL() string {

559 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/%d/steps/%s",	636 return fmt.Sprintf("https://build.chromium.org/p/%s/builders/%s/builds/%d/steps/%s",

560 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscape(f.step.Name))	637 f.masterName, oldEscape(f.builderName), f.build.Number, oldEscape(f.step.Name))

561 }	638 }

OLD	NEW

« no previous file with comments | « no previous file | go/src/infra/monitoring/analyzer/analyzer_test.go » ('j') | go/src/infra/monitoring/messages/alerts.go » ('J')