| OLD | NEW |
| 1 # This file defines alerts to be triggered by the server. | 1 # This file defines alerts to be triggered by the server. |
| 2 | 2 |
| 3 # | 3 # |
| 4 # SkiaPerf and SkiaGold | 4 # SkiaPerf and SkiaGold |
| 5 # | 5 # |
| 6 | 6 |
| 7 [[rule]] | 7 [[rule]] |
| 8 name = "Perf Alerts" | 8 name = "Perf Alerts" |
| 9 message = "At least one perf alert has been found. Please visit https://perf.ski
a.org/alerts/ to triage." | 9 message = "At least one perf alert has been found. Please visit https://perf.ski
a.org/alerts/ to triage." |
| 10 query = "select value from /skiaperf.skia-perf.alerting.new.value/ limit 1" | 10 query = "select value from /skiaperf.skia-perf.alerting.new.value/ limit 1" |
| (...skipping 371 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 382 [[rule]] | 382 [[rule]] |
| 383 name = "Low Disk Space (skia-status /mnt/pd0)" | 383 name = "Low Disk Space (skia-status /mnt/pd0)" |
| 384 message = "Free space has fallen below 5GB on skia-status (/mnt/pd0)." | 384 message = "Free space has fallen below 5GB on skia-status (/mnt/pd0)." |
| 385 query = "select mean(value) from /collectd.skia-status.df-mnt-pd0.df_complex-fre
e/ where time > now() - 5m;" | 385 query = "select mean(value) from /collectd.skia-status.df-mnt-pd0.df_complex-fre
e/ where time > now() - 5m;" |
| 386 category = "infra" | 386 category = "infra" |
| 387 condition = "x <= 5e9" | 387 condition = "x <= 5e9" |
| 388 actions = ["Email(infra-alerts@skia.org)"] | 388 actions = ["Email(infra-alerts@skia.org)"] |
| 389 auto-dismiss = true | 389 auto-dismiss = true |
| 390 nag = "1h" | 390 nag = "1h" |
| 391 | 391 |
| 392 [[rule]] |
| 393 name = "Low Disk Space (skia-ctfe)" |
| 394 message = "Free space has fallen below 1GB on skia-ctfe (root)." |
| 395 query = "select mean(value) from /collectd.skia-ctfe.df-root.df_complex-free/ wh
ere time > now() - 5m;" |
| 396 category = "infra" |
| 397 condition = "x <= 1e9" |
| 398 actions = ["Email(infra-alerts@skia.org)"] |
| 399 auto-dismiss = true |
| 400 nag = "1h" |
| 401 |
| 392 # | 402 # |
| 393 # Skia Status | 403 # Skia Status |
| 394 # | 404 # |
| 395 | 405 |
| 396 [[rule]] | 406 [[rule]] |
| 397 name = "Skia Status Prober (main page)" | 407 name = "Skia Status Prober (main page)" |
| 398 message = "The main page at https://status.skia.org has failed." | 408 message = "The main page at https://status.skia.org has failed." |
| 399 query = "select mean(value) from /prober.skiastatus.failure.value/ where time >
now() - 10m;" | 409 query = "select mean(value) from /prober.skiastatus.failure.value/ where time >
now() - 10m;" |
| 400 category = "infra" | 410 category = "infra" |
| 401 condition = "x >= 1" | 411 condition = "x >= 1" |
| (...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 517 | 527 |
| 518 [[rule]] | 528 [[rule]] |
| 519 name = "Datahopper Buildbot Ingestion Stalled (client.skia.fyi)" | 529 name = "Datahopper Buildbot Ingestion Stalled (client.skia.fyi)" |
| 520 message = "Buildbot Ingestion in Datahopper has failed to run in at least 10 min
utes for client.skia.fyi." | 530 message = "Buildbot Ingestion in Datahopper has failed to run in at least 10 min
utes for client.skia.fyi." |
| 521 query = "select mean(value) from /datahopper.skia-datahopper.buildbot-ingest-cli
ent.skia.fyi.time-since-last-successful-update.value/ where time > now() - 10m" | 531 query = "select mean(value) from /datahopper.skia-datahopper.buildbot-ingest-cli
ent.skia.fyi.time-since-last-successful-update.value/ where time > now() - 10m" |
| 522 category = "infra" | 532 category = "infra" |
| 523 condition = "x >= 600" | 533 condition = "x >= 600" |
| 524 actions = ["Email(infra-alerts@skia.org)"] | 534 actions = ["Email(infra-alerts@skia.org)"] |
| 525 auto-dismiss = true | 535 auto-dismiss = true |
| 526 nag = "1h" | 536 nag = "1h" |
| 537 |
| 538 # |
| 539 # CTFE |
| 540 # |
| 541 |
| 542 [[rule]] |
| 543 name = "CTFE Staging Prober (main page)" |
| 544 message = "The main page at https://ct-staging.skia.org is unavailable." |
| 545 query = "select mean(value) from /prober.ctfe_staging.failure.value/ where time
> now() - 10m;" |
| 546 category = "infra" |
| 547 condition = "x >= 1" |
| 548 actions = ["Email(infra-alerts@skia.org)"] |
| 549 auto-dismiss = true |
| 550 nag = "1h" |
| 551 |
| 552 [[rule]] |
| 553 name = "CTFE Staging Prober (chromium_perf)" |
| 554 message = "The page at https://ct-staging.skia.org/chromium_perf/ is unavailable
." |
| 555 query = "select mean(value) from /prober.ctfe_staging_chromium_perf.failure.valu
e/ where time > now() - 10m;" |
| 556 category = "infra" |
| 557 condition = "x >= 1" |
| 558 actions = ["Email(infra-alerts@skia.org)"] |
| 559 auto-dismiss = true |
| 560 nag = "1h" |
| 561 |
| 562 [[rule]] |
| 563 name = "CTFE Staging Prober (capture_skps)" |
| 564 message = "The page at https://ct-staging.skia.org/capture_skps/ is unavailable.
" |
| 565 query = "select mean(value) from /prober.ctfe_staging_capture_skps.failure.value
/ where time > now() - 10m;" |
| 566 category = "infra" |
| 567 condition = "x >= 1" |
| 568 actions = ["Email(infra-alerts@skia.org)"] |
| 569 auto-dismiss = true |
| 570 nag = "1h" |
| 571 |
| 572 [[rule]] |
| 573 name = "CTFE Staging Prober (lua_script)" |
| 574 message = "The page at https://ct-staging.skia.org/lua_script/ is unavailable." |
| 575 query = "select mean(value) from /prober.ctfe_staging_lua_script.failure.value/
where time > now() - 10m;" |
| 576 category = "infra" |
| 577 condition = "x >= 1" |
| 578 actions = ["Email(infra-alerts@skia.org)"] |
| 579 auto-dismiss = true |
| 580 nag = "1h" |
| 581 |
| 582 [[rule]] |
| 583 name = "CTFE Staging Prober (chromium_builds)" |
| 584 message = "The page at https://ct-staging.skia.org/chromium_builds/ is unavailab
le." |
| 585 query = "select mean(value) from /prober.ctfe_staging_chromium_builds.failure.va
lue/ where time > now() - 10m;" |
| 586 category = "infra" |
| 587 condition = "x >= 1" |
| 588 actions = ["Email(infra-alerts@skia.org)"] |
| 589 auto-dismiss = true |
| 590 nag = "1h" |
| 591 |
| 592 [[rule]] |
| 593 name = "CTFE Staging Prober (admin_tasks)" |
| 594 message = "The page at https://ct-staging.skia.org/admin_tasks/ is unavailable." |
| 595 query = "select mean(value) from /prober.ctfe_staging_admin_tasks.failure.value/
where time > now() - 10m;" |
| 596 category = "infra" |
| 597 condition = "x >= 1" |
| 598 actions = ["Email(infra-alerts@skia.org)"] |
| 599 auto-dismiss = true |
| 600 nag = "1h" |
| 601 |
| 602 [[rule]] |
| 603 name = "CTFE Staging Prober (queue)" |
| 604 message = "The page at https://ct-staging.skia.org/queue/ is unavailable." |
| 605 query = "select mean(value) from /prober.ctfe_staging_queue.failure.value/ where
time > now() - 10m;" |
| 606 category = "infra" |
| 607 condition = "x >= 1" |
| 608 actions = ["Email(infra-alerts@skia.org)"] |
| 609 auto-dismiss = true |
| 610 nag = "1h" |
| 611 |
| 612 [[rule]] |
| 613 name = "CTFE Staging Prober (history)" |
| 614 message = "The page at https://ct-staging.skia.org/history/ is unavailable." |
| 615 query = "select mean(value) from /prober.ctfe_staging_history.failure.value/ whe
re time > now() - 10m;" |
| 616 category = "infra" |
| 617 condition = "x >= 1" |
| 618 actions = ["Email(infra-alerts@skia.org)"] |
| 619 auto-dismiss = true |
| 620 nag = "1h" |
| 621 |
| 622 [[rule]] |
| 623 name = "CTFE Staging Prober (chromium_perf_runs)" |
| 624 message = "The page at https://ct-staging.skia.org/chromium_perf_runs/ is unavai
lable." |
| 625 query = "select mean(value) from /prober.ctfe_staging_chromium_perf_runs.failure
.value/ where time > now() - 10m;" |
| 626 category = "infra" |
| 627 condition = "x >= 1" |
| 628 actions = ["Email(infra-alerts@skia.org)"] |
| 629 auto-dismiss = true |
| 630 nag = "1h" |
| 631 |
| 632 [[rule]] |
| 633 name = "CTFE Staging Prober (capture_skp_runs)" |
| 634 message = "The page at https://ct-staging.skia.org/capture_skp_runs/ is unavaila
ble." |
| 635 query = "select mean(value) from /prober.ctfe_staging_capture_skp_runs.failure.v
alue/ where time > now() - 10m;" |
| 636 category = "infra" |
| 637 condition = "x >= 1" |
| 638 actions = ["Email(infra-alerts@skia.org)"] |
| 639 auto-dismiss = true |
| 640 nag = "1h" |
| 641 |
| 642 [[rule]] |
| 643 name = "CTFE Staging Prober (lua_script_runs)" |
| 644 message = "The page at https://ct-staging.skia.org/lua_script_runs/ is unavailab
le." |
| 645 query = "select mean(value) from /prober.ctfe_staging_lua_script_runs.failure.va
lue/ where time > now() - 10m;" |
| 646 category = "infra" |
| 647 condition = "x >= 1" |
| 648 actions = ["Email(infra-alerts@skia.org)"] |
| 649 auto-dismiss = true |
| 650 nag = "1h" |
| 651 |
| 652 [[rule]] |
| 653 name = "CTFE Staging Prober (chromium_builds_runs)" |
| 654 message = "The page at https://ct-staging.skia.org/chromium_builds_runs/ is unav
ailable." |
| 655 query = "select mean(value) from /prober.ctfe_staging_chromium_builds_runs.failu
re.value/ where time > now() - 10m;" |
| 656 category = "infra" |
| 657 condition = "x >= 1" |
| 658 actions = ["Email(infra-alerts@skia.org)"] |
| 659 auto-dismiss = true |
| 660 nag = "1h" |
| 661 |
| 662 [[rule]] |
| 663 name = "CTFE Staging Prober (recreate_page_sets_runs)" |
| 664 message = "The page at https://ct-staging.skia.org/recreate_page_sets_runs/ is u
navailable." |
| 665 query = "select mean(value) from /prober.ctfe_staging_recreate_page_sets_runs.fa
ilure.value/ where time > now() - 10m;" |
| 666 category = "infra" |
| 667 condition = "x >= 1" |
| 668 actions = ["Email(infra-alerts@skia.org)"] |
| 669 auto-dismiss = true |
| 670 nag = "1h" |
| 671 |
| 672 [[rule]] |
| 673 name = "CTFE Staging Prober (recreate_webpage_archives_runs)" |
| 674 message = "The page at https://ct-staging.skia.org/recreate_webpage_archives_run
s/ is unavailable." |
| 675 query = "select mean(value) from /prober.ctfe_staging_recreate_webpage_archives_
runs.failure.value/ where time > now() - 10m;" |
| 676 category = "infra" |
| 677 condition = "x >= 1" |
| 678 actions = ["Email(infra-alerts@skia.org)"] |
| 679 auto-dismiss = true |
| 680 nag = "1h" |
| 681 |
| 682 [[rule]] |
| 683 name = "CTFE Staging Prober (chromium_perf_parameters)" |
| 684 message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_perf/ is
unavailable or returning unexpected data." |
| 685 query = "select mean(value) from /prober.ctfe_staging_chromium_perf_parameters.f
ailure.value/ where time > now() - 10m;" |
| 686 category = "infra" |
| 687 condition = "x >= 1" |
| 688 actions = ["Email(infra-alerts@skia.org)"] |
| 689 auto-dismiss = true |
| 690 nag = "1h" |
| 691 |
| 692 [[rule]] |
| 693 name = "CTFE Staging Prober (chromium_rev_data)" |
| 694 message = "The JSON endpoint at https://ct-staging.skia.org/_/chromium_rev_data?
rev=LKGR is unavailable or returning unexpected data." |
| 695 query = "select mean(value) from /prober.ctfe_staging_chromium_rev_data.failure.
value/ where time > now() - 10m;" |
| 696 category = "infra" |
| 697 condition = "x >= 1" |
| 698 actions = ["Email(infra-alerts@skia.org)"] |
| 699 auto-dismiss = true |
| 700 nag = "1h" |
| 701 |
| 702 [[rule]] |
| 703 name = "CTFE Staging Prober (skia_rev_data)" |
| 704 message = "The JSON endpoint at https://ct-staging.skia.org/_/skia_rev_data?rev=
LKGR is unavailable or returning unexpected data." |
| 705 query = "select mean(value) from /prober.ctfe_staging_skia_rev_data.failure.valu
e/ where time > now() - 10m;" |
| 706 category = "infra" |
| 707 condition = "x >= 1" |
| 708 actions = ["Email(infra-alerts@skia.org)"] |
| 709 auto-dismiss = true |
| 710 nag = "1h" |
| 711 |
| 712 [[rule]] |
| 713 name = "CTFE Staging Prober (get_chromium_perf_tasks)" |
| 714 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_perf_
tasks?size=2 is unavailable or returning unexpected data." |
| 715 query = "select mean(value) from /prober.ctfe_staging_get_chromium_perf_tasks.fa
ilure.value/ where time > now() - 10m;" |
| 716 category = "infra" |
| 717 condition = "x >= 1" |
| 718 actions = ["Email(infra-alerts@skia.org)"] |
| 719 auto-dismiss = true |
| 720 nag = "1h" |
| 721 |
| 722 [[rule]] |
| 723 name = "CTFE Staging Prober (get_capture_skp_tasks)" |
| 724 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_capture_skp_ta
sks?size=2 is unavailable or returning unexpected data." |
| 725 query = "select mean(value) from /prober.ctfe_staging_get_capture_skp_tasks.fail
ure.value/ where time > now() - 10m;" |
| 726 category = "infra" |
| 727 condition = "x >= 1" |
| 728 actions = ["Email(infra-alerts@skia.org)"] |
| 729 auto-dismiss = true |
| 730 nag = "1h" |
| 731 |
| 732 [[rule]] |
| 733 name = "CTFE Staging Prober (get_lua_script_tasks)" |
| 734 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_lua_script_tas
ks?size=2 is unavailable or returning unexpected data." |
| 735 query = "select mean(value) from /prober.ctfe_staging_get_lua_script_tasks.failu
re.value/ where time > now() - 10m;" |
| 736 category = "infra" |
| 737 condition = "x >= 1" |
| 738 actions = ["Email(infra-alerts@skia.org)"] |
| 739 auto-dismiss = true |
| 740 nag = "1h" |
| 741 |
| 742 [[rule]] |
| 743 name = "CTFE Staging Prober (get_chromium_build_tasks)" |
| 744 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_chromium_build
_tasks?size=2 is unavailable or returning unexpected data." |
| 745 query = "select mean(value) from /prober.ctfe_staging_get_chromium_build_tasks.f
ailure.value/ where time > now() - 10m;" |
| 746 category = "infra" |
| 747 condition = "x >= 1" |
| 748 actions = ["Email(infra-alerts@skia.org)"] |
| 749 auto-dismiss = true |
| 750 nag = "1h" |
| 751 |
| 752 [[rule]] |
| 753 name = "CTFE Staging Prober (get_recreate_page_sets_tasks)" |
| 754 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_page_
sets_tasks?size=2 is unavailable or returning unexpected data." |
| 755 query = "select mean(value) from /prober.ctfe_staging_get_recreate_page_sets_tas
ks.failure.value/ where time > now() - 10m;" |
| 756 category = "infra" |
| 757 condition = "x >= 1" |
| 758 actions = ["Email(infra-alerts@skia.org)"] |
| 759 auto-dismiss = true |
| 760 nag = "1h" |
| 761 |
| 762 [[rule]] |
| 763 name = "CTFE Staging Prober (get_recreate_webpage_archives_tasks)" |
| 764 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_recreate_webpa
ge_archives_tasks?size=2 is unavailable or returning unexpected data." |
| 765 query = "select mean(value) from /prober.ctfe_staging_get_recreate_webpage_archi
ves_tasks.failure.value/ where time > now() - 10m;" |
| 766 category = "infra" |
| 767 condition = "x >= 1" |
| 768 actions = ["Email(infra-alerts@skia.org)"] |
| 769 auto-dismiss = true |
| 770 nag = "1h" |
| 771 |
| 772 [[rule]] |
| 773 name = "CTFE Staging Prober (get_oldest_pending_task)" |
| 774 message = "The JSON endpoint at https://ct-staging.skia.org/_/get_oldest_pending
_task is unavailable or returning unexpected data." |
| 775 query = "select mean(value) from /prober.ctfe_staging_get_oldest_pending_task.fa
ilure.value/ where time > now() - 10m;" |
| 776 category = "infra" |
| 777 condition = "x >= 1" |
| 778 actions = ["Email(infra-alerts@skia.org)"] |
| 779 auto-dismiss = true |
| 780 nag = "1h" |
| 781 |
| 782 [[rule]] |
| 783 name = "CTFE Staging Prober (any_skp_repository_available)" |
| 784 message = "There are no SKP repositories available for running Lua scripts." |
| 785 query = "select mean(value) from /prober.ctfe_staging_any_skp_repository_availab
le.failure.value/ where time > now() - 60m;" |
| 786 category = "infra" |
| 787 condition = "x >= 1" |
| 788 actions = ["Email(infra-alerts@skia.org)"] |
| 789 auto-dismiss = true |
| 790 nag = "24h" |
| 791 |
| 792 [[rule]] |
| 793 name = "CTFE Staging Prober (any_chromium_builds_available)" |
| 794 message = "There are no Chromium builds available for running tasks." |
| 795 query = "select mean(value) from /prober.ctfe_staging_any_chromium_builds_availa
ble.failure.value/ where time > now() - 60m;" |
| 796 category = "infra" |
| 797 condition = "x >= 1" |
| 798 actions = ["Email(infra-alerts@skia.org)"] |
| 799 auto-dismiss = true |
| 800 nag = "24h" |
| 801 |
| 802 [[rule]] |
| 803 name = "CTFE Staging Pending Task Count" |
| 804 message = "There are too many pending tasks on CTFE staging (10-minute mean of at least 100)." |
| 805 query = "select mean(value) from /ctfe.ct-staging-skia-org.num-pending-tasks.val
ue/ where time > now() - 10m;" |
| 806 category = "infra" |
| 807 condition = "x >= 100" |
| 808 actions = ["Email(infra-alerts@skia.org)"] |
| 809 auto-dismiss = true |
| 810 nag = "1h" |
| 811 |
| 812 [[rule]] |
| 813 name = "CTFE Staging Pending Task Status" |
| 814 message = "A task has been waiting to be executed for a while and it's still not
started." |
| 815 query = "select mean(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-s
tatus.value/ where time > now() - 10m;" |
| 816 category = "infra" |
| 817 condition = "x >= 2" |
| 818 actions = ["Email(infra-alerts@skia.org)"] |
| 819 auto-dismiss = true |
| 820 nag = "1h" |
| 821 |
| 822 [[rule]] |
| 823 name = "CTFE Staging Last Metrics Update" |
| 824 message = "No recent update from the CTFE metrics goroutine." |
| 825 query = "select count(value) from /ctfe.ct-staging-skia-org.oldest-pending-task-
status.value/ where time > now() - 10m;" |
| 826 category = "infra" |
| 827 condition = "x < 1" |
| 828 actions = ["Email(infra-alerts@skia.org)"] |
| 829 auto-dismiss = true |
| 830 nag = "1h" |
| 831 |
| 832 [[rule]] |
| 833 name = "CTFE Staging Number of Goroutines" |
| 834 message = "There are more goroutines running than expected." |
| 835 query = "select mean(value) from /ctfe.ct-staging-skia-org.runtime.NumGoroutine.
value/ where time > now() - 10m;" |
| 836 category = "infra" |
| 837 condition = "x >= 100" |
| 838 actions = ["Email(infra-alerts@skia.org)"] |
| 839 auto-dismiss = true |
| 840 nag = "1h" |
| 841 |
| 842 [[rule]] |
| 843 name = "CTFE Staging Error Rate" |
| 844 message = "The error rate is too high." |
| 845 query = "select derivative(value) from /^logserver.skia-ctfe.skia-ctfe.logserver
.ctfe.ERROR.value$/ where time > now() - 10m" |
| 846 category = "infra" |
| 847 condition = "x >= 5" |
| 848 actions = ["Email(infra-alerts@skia.org)"] |
| 849 auto-dismiss = false |
| 850 nag = "1h" |
| OLD | NEW |