Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(107)

Side by Side Diff: alertserver/alerts.cfg

Issue 1307333002: Add alerting for CTFE V2. (Closed) Base URL: https://skia.googlesource.com/buildbot@master
Patch Set: Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | ct/go/ctfe/main.go » ('j') | ct/go/ctfe/main.go » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # This file defines alerts to be triggered by the server. 1 # This file defines alerts to be triggered by the server.
2 2
3 # 3 #
4 # SkiaPerf and SkiaGold 4 # SkiaPerf and SkiaGold
5 # 5 #
6 6
7 [[rule]] 7 [[rule]]
8 name = "Perf Alerts" 8 name = "Perf Alerts"
9 message = "At least one perf alert has been found. Please visit https://perf.skia.org/alerts/ to triage." 9 message = "At least one perf alert has been found. Please visit https://perf.skia.org/alerts/ to triage."
10 query = "select value from /skiaperf.skia-perf.alerting.new.value/ limit 1" 10 query = "select value from /skiaperf.skia-perf.alerting.new.value/ limit 1"
(...skipping 371 matching lines...) Expand 10 before | Expand all | Expand 10 after
382 [[rule]] 382 [[rule]]
383 name = "Low Disk Space (skia-status /mnt/pd0)" 383 name = "Low Disk Space (skia-status /mnt/pd0)"
384 message = "Free space has fallen below 5GB on skia-status (/mnt/pd0)." 384 message = "Free space has fallen below 5GB on skia-status (/mnt/pd0)."
385 query = "select mean(value) from /collectd.skia-status.df-mnt-pd0.df_complex-free/ where time > now() - 5m;" 385 query = "select mean(value) from /collectd.skia-status.df-mnt-pd0.df_complex-free/ where time > now() - 5m;"
386 category = "infra" 386 category = "infra"
387 condition = "x <= 5e9" 387 condition = "x <= 5e9"
388 actions = ["Email(infra-alerts@skia.org)"] 388 actions = ["Email(infra-alerts@skia.org)"]
389 auto-dismiss = true 389 auto-dismiss = true
390 nag = "1h" 390 nag = "1h"
391 391
392 [[rule]]
393 name = "Low Disk Space (skia-ctfe)"
394 message = "Free space has fallen below 1GB on skia-ctfe (root)."
395 query = "select mean(value) from /collectd.skia-ctfe.df-root.df_complex-free/ where time > now() - 5m;"
396 category = "infra"
397 condition = "x <= 1e9"
398 actions = ["Email(infra-alerts@skia.org)"]
399 auto-dismiss = true
400 nag = "1h"
401
392 # 402 #
393 # Skia Status 403 # Skia Status
394 # 404 #
395 405
396 [[rule]] 406 [[rule]]
397 name = "Skia Status Prober (main page)" 407 name = "Skia Status Prober (main page)"
398 message = "The main page at https://status.skia.org has failed." 408 message = "The main page at https://status.skia.org has failed."
399 query = "select mean(value) from /prober.skiastatus.failure.value/ where time > now() - 10m;" 409 query = "select mean(value) from /prober.skiastatus.failure.value/ where time > now() - 10m;"
400 category = "infra" 410 category = "infra"
401 condition = "x >= 1" 411 condition = "x >= 1"
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
517 527
518 [[rule]] 528 [[rule]]
519 name = "Datahopper Buildbot Ingestion Stalled (client.skia.fyi)" 529 name = "Datahopper Buildbot Ingestion Stalled (client.skia.fyi)"
520 message = "Buildbot Ingestion in Datahopper has failed to run in at least 10 minutes for client.skia.fyi." 530 message = "Buildbot Ingestion in Datahopper has failed to run in at least 10 minutes for client.skia.fyi."
521 query = "select mean(value) from /datahopper.skia-datahopper.buildbot-ingest-client.skia.fyi.time-since-last-successful-update.value/ where time > now() - 10m" 531 query = "select mean(value) from /datahopper.skia-datahopper.buildbot-ingest-client.skia.fyi.time-since-last-successful-update.value/ where time > now() - 10m"
522 category = "infra" 532 category = "infra"
523 condition = "x >= 600" 533 condition = "x >= 600"
524 actions = ["Email(infra-alerts@skia.org)"] 534 actions = ["Email(infra-alerts@skia.org)"]
525 auto-dismiss = true 535 auto-dismiss = true
526 nag = "1h" 536 nag = "1h"
537
538 #
539 # CTFE
540 #
541
542 [[rule]]
543 name = "CTFE Staging Prober (main page)"
544 message = "The main page at http://ct-staging.skia.org is unavailable."
545 query = "select mean(value) from /prober.ctfe_staging.failure.value/ where time > now() - 10m;"
546 category = "infra"
547 condition = "x >= 1"
548 actions = ["Email(infra-alerts@skia.org)"]
549 auto-dismiss = true
550 nag = "1h"
551
552 [[rule]]
553 name = "CTFE Staging Prober (chromium_perf)"
554 message = "The page at https://ct-staging.skia.org/chromium_perf/ is unavailable."
555 query = "select mean(value) from /prober.ctfe_staging_chromium_perf.failure.value/ where time > now() - 10m;"
556 category = "infra"
557 condition = "x >= 1"
558 actions = ["Email(infra-alerts@skia.org)"]
559 auto-dismiss = true
560 nag = "1h"
561
562 [[rule]]
563 name = "CTFE Staging Prober (capture_skps)"
564 message = "The page at https://ct-staging.skia.org/capture_skps/ is unavailable."
565 query = "select mean(value) from /prober.ctfe_staging_capture_skps.failure.value/ where time > now() - 10m;"
566 category = "infra"
567 condition = "x >= 1"
568 actions = ["Email(infra-alerts@skia.org)"]
569 auto-dismiss = true
570 nag = "1h"
571
572 [[rule]]
573 name = "CTFE Staging Prober (lua_script)"
574 message = "The page at https://ct-staging.skia.org/lua_script/ is unavailable."
575 query = "select mean(value) from /prober.ctfe_staging_lua_script.failure.value/ where time > now() - 10m;"
576 category = "infra"
577 condition = "x >= 1"
578 actions = ["Email(infra-alerts@skia.org)"]
579 auto-dismiss = true
580 nag = "1h"
581
582 [[rule]]
583 name = "CTFE Staging Prober (chromium_builds)"
584 message = "The page at https://ct-staging.skia.org/chromium_builds/ is unavailable."
585 query = "select mean(value) from /prober.ctfe_staging_chromium_builds.failure.value/ where time > now() - 10m;"
586 category = "infra"
587 condition = "x >= 1"
588 actions = ["Email(infra-alerts@skia.org)"]
589 auto-dismiss = true
590 nag = "1h"
591
592 [[rule]]
593 name = "CTFE Staging Prober (admin_tasks)"
594 message = "The page at https://ct-staging.skia.org/admin_tasks/ is unavailable."
595 query = "select mean(value) from /prober.ctfe_staging_admin_tasks.failure.value/ where time > now() - 10m;"
596 category = "infra"
597 condition = "x >= 1"
598 actions = ["Email(infra-alerts@skia.org)"]
599 auto-dismiss = true
600 nag = "1h"
601
602 [[rule]]
603 name = "CTFE Staging Prober (queue)"
604 message = "The page at https://ct-staging.skia.org/queue/ is unavailable."
605 query = "select mean(value) from /prober.ctfe_staging_queue.failure.value/ where time > now() - 10m;"
606 category = "infra"
607 condition = "x >= 1"
608 actions = ["Email(infra-alerts@skia.org)"]
609 auto-dismiss = true
610 nag = "1h"
611
612 [[rule]]
613 name = "CTFE Staging Prober (history)"
614 message = "The page at https://ct-staging.skia.org/history/ is unavailable."
615 query = "select mean(value) from /prober.ctfe_staging_history.failure.value/ where time > now() - 10m;"
616 category = "infra"
617 condition = "x >= 1"
618 actions = ["Email(infra-alerts@skia.org)"]
619 auto-dismiss = true
620 nag = "1h"
621
622 [[rule]]
623 name = "CTFE Staging Prober (chromium_perf_runs)"
624 message = "The page at https://ct-staging.skia.org/chromium_perf_runs/ is unavailable."
625 query = "select mean(value) from /prober.ctfe_staging_chromium_perf_runs.failure.value/ where time > now() - 10m;"
626 category = "infra"
627 condition = "x >= 1"
628 actions = ["Email(infra-alerts@skia.org)"]
629 auto-dismiss = true
630 nag = "1h"
631
632 [[rule]]
633 name = "CTFE Staging Prober (capture_skp_runs)"
634 message = "The page at https://ct-staging.skia.org/capture_skp_runs/ is unavailable."
635 query = "select mean(value) from /prober.ctfe_staging_capture_skp_runs.failure.value/ where time > now() - 10m;"
636 category = "infra"
637 condition = "x >= 1"
638 actions = ["Email(infra-alerts@skia.org)"]
639 auto-dismiss = true
640 nag = "1h"
641
642 [[rule]]
643 name = "CTFE Staging Prober (lua_script_runs)"
644 message = "The page at https://ct-staging.skia.org/lua_script_runs/ is unavailable."
645 query = "select mean(value) from /prober.ctfe_staging_lua_script_runs.failure.value/ where time > now() - 10m;"
646 category = "infra"
647 condition = "x >= 1"
648 actions = ["Email(infra-alerts@skia.org)"]
649 auto-dismiss = true
650 nag = "1h"
651
652 [[rule]]
653 name = "CTFE Staging Prober (chromium_builds_runs)"
654 message = "The page at https://ct-staging.skia.org/chromium_builds_runs/ is unavailable."
655 query = "select mean(value) from /prober.ctfe_staging_chromium_builds_runs.failure.value/ where time > now() - 10m;"
656 category = "infra"
657 condition = "x >= 1"
658 actions = ["Email(infra-alerts@skia.org)"]
659 auto-dismiss = true
660 nag = "1h"
661
662 [[rule]]
663 name = "CTFE Staging Prober (recreate_page_sets_runs)"
664 message = "The page at https://ct-staging.skia.org/recreate_page_sets_runs/ is unavailable."
665 query = "select mean(value) from /prober.ctfe_staging_recreate_page_sets_runs.failure.value/ where time > now() - 10m;"
666 category = "infra"
667 condition = "x >= 1"
668 actions = ["Email(infra-alerts@skia.org)"]
669 auto-dismiss = true
670 nag = "1h"
671
672 [[rule]]
673 name = "CTFE Staging Prober (recreate_webpage_archives_runs)"
674 message = "The page at https://ct-staging.skia.org/recreate_webpage_archives_runs/ is unavailable."
675 query = "select mean(value) from /prober.ctfe_staging_recreate_webpage_archives_runs.failure.value/ where time > now() - 10m;"
676 category = "infra"
677 condition = "x >= 1"
678 actions = ["Email(infra-alerts@skia.org)"]
679 auto-dismiss = true
680 nag = "1h"
681
682 [[rule]]
683 name = "CTFE Staging Prober (chromium_perf_parameters)"
684 message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_perf/ is unavailable or returning unexpected data."
685 query = "select mean(value) from /prober.ctfe_staging_chromium_perf_parameters.failure.value/ where time > now() - 10m;"
686 category = "infra"
687 condition = "x >= 1"
688 actions = ["Email(infra-alerts@skia.org)"]
689 auto-dismiss = true
690 nag = "1h"
691
692 [[rule]]
693 name = "CTFE Staging Prober (chromium_rev_data)"
694 message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_rev_data?rev=LKGR is unavailable or returning unexpected data."
695 query = "select mean(value) from /prober.ctfe_staging_chromium_rev_data.failure.value/ where time > now() - 10m;"
696 category = "infra"
697 condition = "x >= 1"
698 actions = ["Email(infra-alerts@skia.org)"]
699 auto-dismiss = true
700 nag = "1h"
701
702 [[rule]]
703 name = "CTFE Staging Prober (skia_rev_data)"
704 message = "The JSON endpoint at https://ct-staging.skia.org/_/skia_rev_data?rev=LKGR is unavailable or returning unexpected data."
705 query = "select mean(value) from /prober.ctfe_staging_skia_rev_data.failure.value/ where time > now() - 10m;"
706 category = "infra"
707 condition = "x >= 1"
708 actions = ["Email(infra-alerts@skia.org)"]
709 auto-dismiss = true
710 nag = "1h"
711
712 [[rule]]
713 name = "CTFE Staging Prober (get_chromium_perf_tasks)"
714 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_perf_tasks?size=2 is unavailable or returning unexpected data."
715 query = "select mean(value) from /prober.ctfe_staging_get_chromium_perf_tasks.failure.value/ where time > now() - 10m;"
716 category = "infra"
717 condition = "x >= 1"
718 actions = ["Email(infra-alerts@skia.org)"]
719 auto-dismiss = true
720 nag = "1h"
721
722 [[rule]]
723 name = "CTFE Staging Prober (get_capture_skp_tasks)"
724 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_capture_skp_tasks?size=2 is unavailable or returning unexpected data."
725 query = "select mean(value) from /prober.ctfe_staging_get_capture_skp_tasks.failure.value/ where time > now() - 10m;"
726 category = "infra"
727 condition = "x >= 1"
728 actions = ["Email(infra-alerts@skia.org)"]
729 auto-dismiss = true
730 nag = "1h"
731
732 [[rule]]
733 name = "CTFE Staging Prober (get_lua_script_tasks)"
734 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_lua_script_tasks?size=2 is unavailable or returning unexpected data."
735 query = "select mean(value) from /prober.ctfe_staging_get_lua_script_tasks.failure.value/ where time > now() - 10m;"
736 category = "infra"
737 condition = "x >= 1"
738 actions = ["Email(infra-alerts@skia.org)"]
739 auto-dismiss = true
740 nag = "1h"
741
742 [[rule]]
743 name = "CTFE Staging Prober (get_chromium_build_tasks)"
744 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_build_tasks?size=2 is unavailable or returning unexpected data."
745 query = "select mean(value) from /prober.ctfe_staging_get_chromium_build_tasks.failure.value/ where time > now() - 10m;"
746 category = "infra"
747 condition = "x >= 1"
748 actions = ["Email(infra-alerts@skia.org)"]
749 auto-dismiss = true
750 nag = "1h"
751
752 [[rule]]
753 name = "CTFE Staging Prober (get_recreate_page_sets_tasks)"
754 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_page_sets_tasks?size=2 is unavailable or returning unexpected data."
755 query = "select mean(value) from /prober.ctfe_staging_get_recreate_page_sets_tasks.failure.value/ where time > now() - 10m;"
756 category = "infra"
757 condition = "x >= 1"
758 actions = ["Email(infra-alerts@skia.org)"]
759 auto-dismiss = true
760 nag = "1h"
761
762 [[rule]]
763 name = "CTFE Staging Prober (get_recreate_webpage_archives_tasks)"
764 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_webpage_archives_tasks?size=2 is unavailable or returning unexpected data."
765 query = "select mean(value) from /prober.ctfe_staging_get_recreate_webpage_archives_tasks.failure.value/ where time > now() - 10m;"
766 category = "infra"
767 condition = "x >= 1"
768 actions = ["Email(infra-alerts@skia.org)"]
769 auto-dismiss = true
770 nag = "1h"
771
772 [[rule]]
773 name = "CTFE Staging Prober (get_oldest_pending_task)"
774 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_oldest_pending_task is unavailable or returning unexpected data."
775 query = "select mean(value) from /prober.ctfe_staging_get_oldest_pending_task.failure.value/ where time > now() - 10m;"
776 category = "infra"
777 condition = "x >= 1"
778 actions = ["Email(infra-alerts@skia.org)"]
779 auto-dismiss = true
780 nag = "1h"
781
782 [[rule]]
783 name = "CTFE Staging Prober (any_skp_repository_available)"
784 message = "There are no SKP repositories available for running Lua scripts."
785 query = "select mean(value) from /prober.ctfe_staging_any_skp_repository_available.failure.value/ where time > now() - 60m;"
786 category = "infra"
787 condition = "x >= 1"
788 actions = ["Email(infra-alerts@skia.org)"]
789 auto-dismiss = true
790 nag = "24h"
791
792 [[rule]]
793 name = "CTFE Staging Prober (any_chromium_builds_available)"
794 message = "There are no Chromium builds available for running tasks."
795 query = "select mean(value) from /prober.ctfe_staging_any_chromium_builds_available.failure.value/ where time > now() - 60m;"
796 category = "infra"
797 condition = "x >= 1"
798 actions = ["Email(infra-alerts@skia.org)"]
799 auto-dismiss = true
800 nag = "24h"
801
802 [[rule]]
803 name = "CTFE Staging Pending Task Count"
804 message = "There are a lot of pending tasks."
805 query = "select mean(value) from /ctfe.ct-staging-skia-org.num-pending-tasks.value/ where time > now() - 10m;"
806 category = "infra"
807 condition = "x >= 100"
808 actions = ["Email(infra-alerts@skia.org)"]
809 auto-dismiss = true
810 nag = "1h"
811
812 [[rule]]
813 name = "CTFE Staging Pending Task Status"
814 message = "A task has been waiting to be executed for a while and it's still not started."
815 query = "select mean(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-status.value/ where time > now() - 10m;"
816 category = "infra"
817 condition = "x >= 2"
818 actions = ["Email(infra-alerts@skia.org)"]
819 auto-dismiss = true
820 nag = "1h"
821
822 [[rule]]
823 name = "CTFE Staging Last Metrics Update"
824 message = "No recent update from the CTFE metrics goroutine."
825 query = "select count(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-status.value/ where time > now() - 10m;"
826 category = "infra"
827 condition = "x < 1"
828 actions = ["Email(infra-alerts@skia.org)"]
829 auto-dismiss = true
830 nag = "1h"
831
832 [[rule]]
833 name = "CTFE Staging Number of Goroutines"
834 message = "There are more goroutines running than expected."
835 query = "select mean(value) from /ctfe.ct-staging-skia-org.runtime.NumGoroutine.value/ where time > now() - 10m;"
836 category = "infra"
837 condition = "x >= 100"
838 actions = ["Email(infra-alerts@skia.org)"]
839 auto-dismiss = true
840 nag = "1h"
841
842 [[rule]]
843 name = "CTFE Staging Error Rate"
844 message = "The error rate is too high."
845 query = "select derivative(value) from /^logserver.skia-ctfe.skia-ctfe.logserver.ctfe.ERROR.value$/ where time > now() - 10m"
846 category = "infra"
847 condition = "x >= 5"
848 actions = ["Email(infra-alerts@skia.org)"]
849 auto-dismiss = false
850 nag = "1h"
OLDNEW
« no previous file with comments | « no previous file | ct/go/ctfe/main.go » ('j') | ct/go/ctfe/main.go » ('J')

Powered by Google App Engine
This is Rietveld 408576698