OLD | NEW |
1 package alerting | 1 package alerting |
2 | 2 |
3 import ( | 3 import ( |
4 "database/sql" | 4 "database/sql" |
5 "encoding/json" | 5 "encoding/json" |
6 "fmt" | 6 "fmt" |
7 "math" | 7 "math" |
8 "time" | 8 "time" |
9 | 9 |
10 metrics "github.com/rcrowley/go-metrics" | 10 metrics "github.com/rcrowley/go-metrics" |
11 "github.com/skia-dev/glog" | 11 "github.com/skia-dev/glog" |
12 | 12 |
13 "go.skia.org/infra/go/issues" | 13 "go.skia.org/infra/go/issues" |
14 "go.skia.org/infra/go/metadata" | 14 "go.skia.org/infra/go/metadata" |
15 » "go.skia.org/infra/go/tiling" | 15 » tracedb "go.skia.org/infra/go/trace/db" |
16 "go.skia.org/infra/go/util" | 16 "go.skia.org/infra/go/util" |
17 "go.skia.org/infra/perf/go/clustering" | 17 "go.skia.org/infra/perf/go/clustering" |
18 "go.skia.org/infra/perf/go/config" | 18 "go.skia.org/infra/perf/go/config" |
19 "go.skia.org/infra/perf/go/db" | 19 "go.skia.org/infra/perf/go/db" |
20 "go.skia.org/infra/perf/go/types" | 20 "go.skia.org/infra/perf/go/types" |
21 ) | 21 ) |
22 | 22 |
23 const ( | 23 const ( |
24 CLUSTER_SIZE = 50 | 24 CLUSTER_SIZE = 50 |
25 CLUSTER_STDDEV = 0.001 | 25 CLUSTER_STDDEV = 0.001 |
26 | 26 |
27 // TRACKED_ITEM_URL_TEMPLATE is used to generate the URL that is | 27 // TRACKED_ITEM_URL_TEMPLATE is used to generate the URL that is |
28 // embedded in an issue. It is also used to search for issues linked to
a | 28 // embedded in an issue. It is also used to search for issues linked to
a |
29 // specific item (cluster). The format verb is to be replaced with the I
D | 29 // specific item (cluster). The format verb is to be replaced with the I
D |
30 // of the tracked item. | 30 // of the tracked item. |
31 TRACKED_ITEM_URL_TEMPLATE = "https://perf.skia.org/cl/%d" | 31 TRACKED_ITEM_URL_TEMPLATE = "https://perf.skia.org/cl/%d" |
32 ) | 32 ) |
33 | 33 |
34 var ( | 34 var ( |
35 // The number of clusters with a status of "New". | 35 // The number of clusters with a status of "New". |
36 newClustersGauge = metrics.NewRegisteredGauge("alerting.new", metrics.De
faultRegistry) | 36 newClustersGauge = metrics.NewRegisteredGauge("alerting.new", metrics.De
faultRegistry) |
37 | 37 |
38 // The number of times we've successfully done alert clustering. | 38 // The number of times we've successfully done alert clustering. |
39 runsCounter = metrics.NewRegisteredCounter("alerting.runs", metrics.Defa
ultRegistry) | 39 runsCounter = metrics.NewRegisteredCounter("alerting.runs", metrics.Defa
ultRegistry) |
40 | 40 |
41 // How long it takes to do a clustering run. | 41 // How long it takes to do a clustering run. |
42 alertingLatency = metrics.NewRegisteredTimer("alerting.latency", metrics
.DefaultRegistry) | 42 alertingLatency = metrics.NewRegisteredTimer("alerting.latency", metrics
.DefaultRegistry) |
43 | 43 |
44 » // tileStore is the TileStore we are alerting on. | 44 » // tileBuilder is the tracedb.Builder where we load Tiles from. |
45 » tileStore tiling.TileStore | 45 » tileBuilder *tracedb.Builder |
46 ) | 46 ) |
47 | 47 |
48 // CombineClusters combines freshly found clusters with existing clusters. | 48 // CombineClusters combines freshly found clusters with existing clusters. |
49 // | 49 // |
50 // Algorithm: | 50 // Algorithm: |
51 // Run clustering and pick out the "Interesting" clusters. | 51 // Run clustering and pick out the "Interesting" clusters. |
52 // Compare all the Interesting clusters to all the existing relevant clusters
, | 52 // Compare all the Interesting clusters to all the existing relevant clusters
, |
53 // where "relevant" clusters are ones whose Hash/timestamp of the step | 53 // where "relevant" clusters are ones whose Hash/timestamp of the step |
54 // exists in the current tile. | 54 // exists in the current tile. |
55 // Start with an empty "list". | 55 // Start with an empty "list". |
(...skipping 209 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
265 if !util.Int64Equal(bugs, c.Bugs) { | 265 if !util.Int64Equal(bugs, c.Bugs) { |
266 c.Bugs = bugs | 266 c.Bugs = bugs |
267 if err := Write(c); err != nil { | 267 if err := Write(c); err != nil { |
268 return fmt.Errorf("Alerting: Failed to write updated clu
ster with bugs: %s", err) | 268 return fmt.Errorf("Alerting: Failed to write updated clu
ster with bugs: %s", err) |
269 } | 269 } |
270 } | 270 } |
271 return nil | 271 return nil |
272 } | 272 } |
273 | 273 |
274 // singleStep does a single round of alerting. | 274 // singleStep does a single round of alerting. |
275 func singleStep(tileStore tiling.TileStore, issueTracker issues.IssueTracker) { | 275 func singleStep(issueTracker issues.IssueTracker) { |
276 latencyBegin := time.Now() | 276 latencyBegin := time.Now() |
277 » tile, err := tileStore.Get(0, -1) | 277 » tile := tileBuilder.GetTile() |
278 » if err != nil { | |
279 » » glog.Errorf("Alerting: Failed to get tile: %s", err) | |
280 » » return | |
281 » } | |
282 | |
283 summary, err := clustering.CalculateClusterSummaries(tile, CLUSTER_SIZE,
CLUSTER_STDDEV, skpOnly) | 278 summary, err := clustering.CalculateClusterSummaries(tile, CLUSTER_SIZE,
CLUSTER_STDDEV, skpOnly) |
284 if err != nil { | 279 if err != nil { |
285 glog.Errorf("Alerting: Failed to calculate clusters: %s", err) | 280 glog.Errorf("Alerting: Failed to calculate clusters: %s", err) |
286 return | 281 return |
287 } | 282 } |
288 fresh := []*types.ClusterSummary{} | 283 fresh := []*types.ClusterSummary{} |
289 for _, c := range summary.Clusters { | 284 for _, c := range summary.Clusters { |
290 if math.Abs(c.StepFit.Regression) > clustering.INTERESTING_THRES
HHOLD { | 285 if math.Abs(c.StepFit.Regression) > clustering.INTERESTING_THRES
HHOLD { |
291 fresh = append(fresh, c) | 286 fresh = append(fresh, c) |
292 } | 287 } |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
331 } | 326 } |
332 } | 327 } |
333 newClustersGauge.Update(int64(count)) | 328 newClustersGauge.Update(int64(count)) |
334 runsCounter.Inc(1) | 329 runsCounter.Inc(1) |
335 alertingLatency.UpdateSince(latencyBegin) | 330 alertingLatency.UpdateSince(latencyBegin) |
336 } | 331 } |
337 | 332 |
338 // calcNewClusters counts how many clusters are "New" and updates | 333 // calcNewClusters counts how many clusters are "New" and updates |
339 // the newClustersGauge metric accordingly. | 334 // the newClustersGauge metric accordingly. |
340 func calcNewClusters() { | 335 func calcNewClusters() { |
341 » tile, err := tileStore.Get(0, -1) | 336 » tile := tileBuilder.GetTile() |
342 » if err != nil { | |
343 » » glog.Errorf("Alerting: Failed to get tile: %s", err) | |
344 » » return | |
345 » } | |
346 current, err := ListFrom(tile.Commits[0].CommitTime) | 337 current, err := ListFrom(tile.Commits[0].CommitTime) |
347 if err != nil { | 338 if err != nil { |
348 glog.Errorf("Alerting: Failed to get existing clusters: %s", err
) | 339 glog.Errorf("Alerting: Failed to get existing clusters: %s", err
) |
349 return | 340 return |
350 } | 341 } |
351 count := 0 | 342 count := 0 |
352 for _, c := range current { | 343 for _, c := range current { |
353 if c.Status == "New" { | 344 if c.Status == "New" { |
354 count++ | 345 count++ |
355 } | 346 } |
356 } | 347 } |
357 glog.Infof("Updated new cluster count: %d", count) | 348 glog.Infof("Updated new cluster count: %d", count) |
358 newClustersGauge.Update(int64(count)) | 349 newClustersGauge.Update(int64(count)) |
359 } | 350 } |
360 | 351 |
361 // Start kicks off a goroutine that periodically refreshes the current alerting
clusters. | 352 // Start kicks off a goroutine that periodically refreshes the current alerting
clusters. |
362 func Start(ts tiling.TileStore, apiKeyFlag string) { | 353 func Start(tb *tracedb.Builder, apiKeyFlag string) { |
| 354 » tileBuilder = tb |
363 apiKey := apiKeyFromFlag(apiKeyFlag) | 355 apiKey := apiKeyFromFlag(apiKeyFlag) |
364 var issueTracker issues.IssueTracker = nil | 356 var issueTracker issues.IssueTracker = nil |
365 if apiKey != "" { | 357 if apiKey != "" { |
366 issueTracker = issues.NewIssueTracker(apiKey) | 358 issueTracker = issues.NewIssueTracker(apiKey) |
367 } | 359 } |
368 | 360 |
369 tileStore = ts | |
370 go func() { | 361 go func() { |
371 for _ = range time.Tick(config.RECLUSTER_DURATION) { | 362 for _ = range time.Tick(config.RECLUSTER_DURATION) { |
372 » » » singleStep(ts, issueTracker) | 363 » » » singleStep(issueTracker) |
373 } | 364 } |
374 }() | 365 }() |
375 } | 366 } |
OLD | NEW |