Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1862)

Unified Diff: alertserver/alerts.cfg

Issue 1307333002: Add alerting for CTFE V2. (Closed) Base URL: https://skia.googlesource.com/buildbot@master
Patch Set: Rebase. Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | ct/go/ctfe/main.go » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: alertserver/alerts.cfg
diff --git a/alertserver/alerts.cfg b/alertserver/alerts.cfg
index 80e4f36906c269a58bce05a30bba7cad90f3fdbd..0608f6d910d2d4fd3a28f4f3e84731c372595153 100644
--- a/alertserver/alerts.cfg
+++ b/alertserver/alerts.cfg
@@ -389,6 +389,16 @@ actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
+[[rule]]
+name = "Low Disk Space (skia-ctfe)"
+message = "Free space has fallen below 1GB on skia-ctfe (root)."
+query = "select mean(value) from /collectd.skia-ctfe.df-root.df_complex-free/ where time > now() - 5m;"
+category = "infra"
+condition = "x <= 1e9"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
#
# Skia Status
#
@@ -524,3 +534,317 @@ condition = "x >= 600"
actions = ["Email(infra-alerts@skia.org)"]
auto-dismiss = true
nag = "1h"
+
+#
+# CTFE
+#
+
+[[rule]]
+name = "CTFE Staging Prober (main page)"
+message = "The main page at https://ct-staging.skia.org is unavailable."
+query = "select mean(value) from /prober.ctfe_staging.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (chromium_perf)"
+message = "The page at https://ct-staging.skia.org/chromium_perf/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_chromium_perf.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (capture_skps)"
+message = "The page at https://ct-staging.skia.org/capture_skps/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_capture_skps.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (lua_script)"
+message = "The page at https://ct-staging.skia.org/lua_script/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_lua_script.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (chromium_builds)"
+message = "The page at https://ct-staging.skia.org/chromium_builds/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_chromium_builds.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (admin_tasks)"
+message = "The page at https://ct-staging.skia.org/admin_tasks/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_admin_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (queue)"
+message = "The page at https://ct-staging.skia.org/queue/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_queue.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (history)"
+message = "The page at https://ct-staging.skia.org/history/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_history.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (chromium_perf_runs)"
+message = "The page at https://ct-staging.skia.org/chromium_perf_runs/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_chromium_perf_runs.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (capture_skp_runs)"
+message = "The page at https://ct-staging.skia.org/capture_skp_runs/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_capture_skp_runs.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (lua_script_runs)"
+message = "The page at https://ct-staging.skia.org/lua_script_runs/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_lua_script_runs.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (chromium_builds_runs)"
+message = "The page at https://ct-staging.skia.org/chromium_builds_runs/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_chromium_builds_runs.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (recreate_page_sets_runs)"
+message = "The page at https://ct-staging.skia.org/recreate_page_sets_runs/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_recreate_page_sets_runs.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (recreate_webpage_archives_runs)"
+message = "The page at https://ct-staging.skia.org/recreate_webpage_archives_runs/ is unavailable."
+query = "select mean(value) from /prober.ctfe_staging_recreate_webpage_archives_runs.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (chromium_perf_parameters)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_perf/ is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_chromium_perf_parameters.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (chromium_rev_data)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_rev_data?rev=LKGR is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_chromium_rev_data.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (skia_rev_data)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/skia_rev_data?rev=LKGR is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_skia_rev_data.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_chromium_perf_tasks)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_perf_tasks?size=2 is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_chromium_perf_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_capture_skp_tasks)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_capture_skp_tasks?size=2 is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_capture_skp_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_lua_script_tasks)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_lua_script_tasks?size=2 is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_lua_script_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_chromium_build_tasks)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_build_tasks?size=2 is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_chromium_build_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_recreate_page_sets_tasks)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_page_sets_tasks?size=2 is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_recreate_page_sets_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_recreate_webpage_archives_tasks)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_webpage_archives_tasks?size=2 is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_recreate_webpage_archives_tasks.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (get_oldest_pending_task)"
+message = "The JSON endpoint at https://ct-staging.skia.org/_/get_oldest_pending_task is unavailable or returning unexpected data."
+query = "select mean(value) from /prober.ctfe_staging_get_oldest_pending_task.failure.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Prober (any_skp_repository_available)"
+message = "There are no SKP repositories available for running Lua scripts."
+query = "select mean(value) from /prober.ctfe_staging_any_skp_repository_available.failure.value/ where time > now() - 60m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "24h"
+
+[[rule]]
+name = "CTFE Staging Prober (any_chromium_builds_available)"
+message = "There are no Chromium builds available for running tasks."
+query = "select mean(value) from /prober.ctfe_staging_any_chromium_builds_available.failure.value/ where time > now() - 60m;"
+category = "infra"
+condition = "x >= 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "24h"
+
+[[rule]]
+name = "CTFE Staging Pending Task Count"
+message = "There are a lot of pending tasks."
+query = "select mean(value) from /ctfe.ct-staging-skia-org.num-pending-tasks.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 100"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Pending Task Status"
+message = "A task has been waiting to be executed for a while and it still has not started."
+query = "select mean(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-status.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 2"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Last Metrics Update"
+message = "No recent update from the CTFE metrics goroutine."
+query = "select count(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-status.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x < 1"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Number of Goroutines"
+message = "There are more goroutines running than expected."
+query = "select mean(value) from /ctfe.ct-staging-skia-org.runtime.NumGoroutine.value/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 100"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = true
+nag = "1h"
+
+[[rule]]
+name = "CTFE Staging Error Rate"
+message = "The error rate is too high."
+query = "select derivative(value) from /^logserver.skia-ctfe.skia-ctfe.logserver.ctfe.ERROR.value$/ where time > now() - 10m;"
+category = "infra"
+condition = "x >= 5"
+actions = ["Email(infra-alerts@skia.org)"]
+auto-dismiss = false
+nag = "1h"
« no previous file with comments | « no previous file | ct/go/ctfe/main.go » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698