| Index: alertserver/alerts.cfg
|
| diff --git a/alertserver/alerts.cfg b/alertserver/alerts.cfg
|
| index 80e4f36906c269a58bce05a30bba7cad90f3fdbd..0608f6d910d2d4fd3a28f4f3e84731c372595153 100644
|
| --- a/alertserver/alerts.cfg
|
| +++ b/alertserver/alerts.cfg
|
| @@ -389,6 +389,16 @@ actions = ["Email(infra-alerts@skia.org)"]
|
| auto-dismiss = true
|
| nag = "1h"
|
|
|
| +[[rule]]
|
| +name = "Low Disk Space (skia-ctfe)"
|
| +message = "Free space has fallen below 1GB on skia-ctfe (root)."
|
| +query = "select mean(value) from /collectd.skia-ctfe.df-root.df_complex-free/ where time > now() - 5m;"
|
| +category = "infra"
|
| +condition = "x <= 1e9"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| #
|
| # Skia Status
|
| #
|
| @@ -524,3 +534,317 @@ condition = "x >= 600"
|
| actions = ["Email(infra-alerts@skia.org)"]
|
| auto-dismiss = true
|
| nag = "1h"
|
| +
|
| +#
|
| +# CTFE
|
| +#
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (main page)"
|
| +message = "The main page at https://ct-staging.skia.org is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (chromium_perf)"
|
| +message = "The page at https://ct-staging.skia.org/chromium_perf/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_chromium_perf.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (capture_skps)"
|
| +message = "The page at https://ct-staging.skia.org/capture_skps/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_capture_skps.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (lua_script)"
|
| +message = "The page at https://ct-staging.skia.org/lua_script/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_lua_script.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (chromium_builds)"
|
| +message = "The page at https://ct-staging.skia.org/chromium_builds/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_chromium_builds.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (admin_tasks)"
|
| +message = "The page at https://ct-staging.skia.org/admin_tasks/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_admin_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (queue)"
|
| +message = "The page at https://ct-staging.skia.org/queue/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_queue.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (history)"
|
| +message = "The page at https://ct-staging.skia.org/history/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_history.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (chromium_perf_runs)"
|
| +message = "The page at https://ct-staging.skia.org/chromium_perf_runs/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_chromium_perf_runs.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (capture_skp_runs)"
|
| +message = "The page at https://ct-staging.skia.org/capture_skp_runs/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_capture_skp_runs.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (lua_script_runs)"
|
| +message = "The page at https://ct-staging.skia.org/lua_script_runs/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_lua_script_runs.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (chromium_builds_runs)"
|
| +message = "The page at https://ct-staging.skia.org/chromium_builds_runs/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_chromium_builds_runs.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (recreate_page_sets_runs)"
|
| +message = "The page at https://ct-staging.skia.org/recreate_page_sets_runs/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_recreate_page_sets_runs.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (recreate_webpage_archives_runs)"
|
| +message = "The page at https://ct-staging.skia.org/recreate_webpage_archives_runs/ is unavailable."
|
| +query = "select mean(value) from /prober.ctfe_staging_recreate_webpage_archives_runs.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (chromium_perf_parameters)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_perf/ is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_chromium_perf_parameters.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (chromium_rev_data)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_rev_data?rev=LKGR is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_chromium_rev_data.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (skia_rev_data)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/skia_rev_data?rev=LKGR is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_skia_rev_data.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_chromium_perf_tasks)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_perf_tasks?size=2 is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_chromium_perf_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_capture_skp_tasks)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_capture_skp_tasks?size=2 is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_capture_skp_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_lua_script_tasks)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_lua_script_tasks?size=2 is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_lua_script_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_chromium_build_tasks)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_build_tasks?size=2 is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_chromium_build_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_recreate_page_sets_tasks)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_page_sets_tasks?size=2 is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_recreate_page_sets_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_recreate_webpage_archives_tasks)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_webpage_archives_tasks?size=2 is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_recreate_webpage_archives_tasks.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (get_oldest_pending_task)"
|
| +message = "The JSON endpoint at https://ct-staging.skia.org/_/get_oldest_pending_task is unavailable or returning unexpected data."
|
| +query = "select mean(value) from /prober.ctfe_staging_get_oldest_pending_task.failure.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (any_skp_repository_available)"
|
| +message = "There are no SKP repositories available for running Lua scripts."
|
| +query = "select mean(value) from /prober.ctfe_staging_any_skp_repository_available.failure.value/ where time > now() - 60m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "24h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Prober (any_chromium_builds_available)"
|
| +message = "There are no Chromium builds available for running tasks."
|
| +query = "select mean(value) from /prober.ctfe_staging_any_chromium_builds_available.failure.value/ where time > now() - 60m;"
|
| +category = "infra"
|
| +condition = "x >= 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "24h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Pending Task Count"
|
| +message = "There are a lot of pending tasks."
|
| +query = "select mean(value) from /ctfe.ct-staging-skia-org.num-pending-tasks.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 100"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Pending Task Status"
|
| +message = "A task has been waiting to be executed for a while and it still has not started."
|
| +query = "select mean(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-status.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 2"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Last Metrics Update"
|
| +message = "No recent update from the CTFE metrics goroutine."
|
| +query = "select count(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-status.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x < 1"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Number of Goroutines"
|
| +message = "There are more goroutines running than expected."
|
| +query = "select mean(value) from /ctfe.ct-staging-skia-org.runtime.NumGoroutine.value/ where time > now() - 10m;"
|
| +category = "infra"
|
| +condition = "x >= 100"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = true
|
| +nag = "1h"
|
| +
|
| +[[rule]]
|
| +name = "CTFE Staging Error Rate"
|
| +message = "The error rate is too high."
|
| +query = "select derivative(value) from /^logserver.skia-ctfe.skia-ctfe.logserver.ctfe.ERROR.value$/ where time > now() - 10m"
|
| +category = "infra"
|
| +condition = "x >= 5"
|
| +actions = ["Email(infra-alerts@skia.org)"]
|
| +auto-dismiss = false
|
| +nag = "1h"
|
|
|