OLD | NEW |
1 # This file defines alerts to be triggered by the server. | 1 # This file defines alerts to be triggered by the server. |
2 | 2 |
3 # | 3 # |
4 # SkiaPerf and SkiaGold | 4 # SkiaPerf and SkiaGold |
5 # | 5 # |
6 | 6 |
7 [[rule]] | 7 [[rule]] |
8 name = "Perf Alerts" | 8 name = "Perf Alerts" |
9 message = "At least one perf alert has been found. Please visit https://perf.skia.org/alerts/ to triage." | 9 message = "At least one perf alert has been found. Please visit https://perf.skia.org/alerts/ to triage." |
10 query = "select value from /skiaperf.skia-perf.alerting.new.value/ limit 1" | 10 query = "select value from /skiaperf.skia-perf.alerting.new.value/ limit 1" |
11 category = "Perf" | 11 category = "Perf" |
12 condition = "x > 0" | 12 condition = "x > 0" |
13 actions = ["Email(alerts@skia.org)"] | 13 actions = ["Email(alerts@skia.org)"] |
14 auto-dismiss = true | 14 auto-dismiss = true |
15 nag = "24h" | 15 nag = "24h" |
16 | 16 |
17 [[rule]] | 17 [[rule]] |
18 name = "Gold Alert (GM)" | 18 name = "Gold Alert (GM)" |
19 message = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage." | 19 message = "At least one untriaged GM has been found. Please visit https://gold.skia.org/ to triage." |
20 query = "select value from /skiacorrectness.skia-gold-prod.status.untriaged.by_corpus.gm.value/ limit 1" | 20 query = "select value from /^skiacorrectness.skia-gold-prod.status.untriaged.by_corpus.gm.value$/ limit 1" |
21 category = "Gold" | 21 category = "Gold" |
22 condition = "x > 0" | 22 condition = "x > 0" |
23 actions = ["Email(alerts@skia.org)"] | 23 actions = ["Email(alerts@skia.org)"] |
24 auto-dismiss = true | 24 auto-dismiss = true |
25 nag = "24h" | 25 nag = "24h" |
26 | 26 |
27 [[rule]] | 27 [[rule]] |
28 name = "Expired Ingores (Gold)" | 28 name = "Expired Ingores (Gold)" |
29 message = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/2/ignores to delete or extend." | 29 message = "At least one expired ignore rule has been found. Please visit https://gold.skia.org/ignores to delete or extend." |
30 query = "select value from /skiacorrectness.skia-gold-prod.num-expired-ignore-rules.value/ limit 1" | 30 query = "select value from /^skiacorrectness.skia-gold-prod.num-expired-ignore-rules.value$/ limit 1" |
31 category = "Gold" | 31 category = "Gold" |
32 condition = "x > 0" | 32 condition = "x > 0" |
33 actions = ["Email(alerts@skia.org)"] | 33 actions = ["Email(alerts@skia.org)"] |
34 auto-dismiss = true | 34 auto-dismiss = true |
35 nag = "24h" | 35 nag = "24h" |
36 | 36 |
37 [[rule]] | 37 [[rule]] |
38 name = "Ingestion Failure (Perf)" | 38 name = "Ingestion Failure (Perf)" |
39 message = "At least two rounds of perf ingestion have failed back to back." | 39 message = "At least two rounds of perf ingestion have failed back to back." |
40 query = "select mean(value) from /ingest.skia-perf.ingester.nano-ingest.gauge.time-since-last-successful-update.value/ where time > now() - 10m" | 40 query = "select mean(value) from /ingest.skia-perf.ingester.nano-ingest.gauge.time-since-last-successful-update.value/ where time > now() - 10m" |
(...skipping 19 matching lines...) |
60 query = "select mean(value) from /ingest.skia-gold-prod.ingester.gold-ingest.gauge.time-since-last-successful-update.value/ where time > now() - 10m" | 60 query = "select mean(value) from /ingest.skia-gold-prod.ingester.gold-ingest.gauge.time-since-last-successful-update.value/ where time > now() - 10m" |
61 category = "infra" | 61 category = "infra" |
62 condition = "x >= 750" | 62 condition = "x >= 750" |
63 actions = ["Email(infra-alerts@skia.org)"] | 63 actions = ["Email(infra-alerts@skia.org)"] |
64 auto-dismiss = true | 64 auto-dismiss = true |
65 nag = "1h" | 65 nag = "1h" |
66 | 66 |
67 [[rule]] | 67 [[rule]] |
68 name = "Ingore Monitoring Failure (Gold)" | 68 name = "Ingore Monitoring Failure (Gold)" |
69 message = "At least two rounds of monitoring for expired ignore rules have failed back to back." | 69 message = "At least two rounds of monitoring for expired ignore rules have failed back to back." |
70 query = "select mean(value) from /skiacorrectness.skia-gold-prod.expired-ignore-rules-monitoring.time-since-last-successful-update.value/ where time > now() - 10m" | 70 query = "select mean(value) from /^skiacorrectness.skia-gold-prod.expired-ignore-rules-monitoring.time-since-last-successful-update.value$/ where time > now() - 10m" |
71 category = "infra" | 71 category = "infra" |
72 condition = "x >= 200" | 72 condition = "x >= 200" |
73 actions = ["Email(infra-alerts@skia.org)"] | 73 actions = ["Email(infra-alerts@skia.org)"] |
74 auto-dismiss = true | 74 auto-dismiss = true |
75 nag = "1h" | 75 nag = "1h" |
76 | 76 |
| 77 [[rule]] |
| 78 name = "Gold Hash Prober" |
| 79 message = "The list of currently considered image digests is not accessible at https://gold.skia.org/_/hashes" |
| 80 query = "select mean(value) from /^prober.skiagold_hashes.failure.value$/ where time > now() - 10m;" |
| 81 category = "infra" |
| 82 condition = "x >= 1" |
| 83 actions = ["Email(infra-alerts@skia.org)"] |
| 84 auto-dismiss = false |
| 85 nag = "1h" |
| 86 |
77 # | 87 # |
78 # SkFiddle | 88 # SkFiddle |
79 # | 89 # |
80 | 90 |
81 [[rule]] | 91 [[rule]] |
82 name = "Skia Fiddle Prober (main page)" | 92 name = "Skia Fiddle Prober (main page)" |
83 message = "The main page at http://skfiddle.com has failed." | 93 message = "The main page at http://skfiddle.com has failed." |
84 query = "select mean(value) from /prober.skfiddle.failure.value/ where time > now() - 10m;" | 94 query = "select mean(value) from /prober.skfiddle.failure.value/ where time > now() - 10m;" |
85 category = "infra" | 95 category = "infra" |
86 condition = "x >= 1" | 96 condition = "x >= 1" |
(...skipping 284 matching lines...) |
371 | 381 |
372 [[rule]] | 382 [[rule]] |
373 name = "Probe Failure (issue-tracker)" | 383 name = "Probe Failure (issue-tracker)" |
374 message = "Ingesting issue tracker issued has failed to run in at least 30 minutes." | 384 message = "Ingesting issue tracker issued has failed to run in at least 30 minutes." |
375 query = "select mean(value) from /probeserver.skia-monitoring.issue-tracker.time-since-last-successful-update.value/ where time > now() - 10m" | 385 query = "select mean(value) from /probeserver.skia-monitoring.issue-tracker.time-since-last-successful-update.value/ where time > now() - 10m" |
376 category = "infra" | 386 category = "infra" |
377 condition = "x >= 1800" | 387 condition = "x >= 1800" |
378 actions = ["Email(infra-alerts@skia.org)"] | 388 actions = ["Email(infra-alerts@skia.org)"] |
379 auto-dismiss = true | 389 auto-dismiss = true |
380 nag = "1h" | 390 nag = "1h" |
OLD | NEW |