From f7d73a0df5282f5358a6e3d6d5ff90e7fa97ea78 Mon Sep 17 00:00:00 2001 From: nyyu Date: Mon, 6 Jan 2025 08:45:05 +0100 Subject: [PATCH] feat: alerts --- conf/prometheus/alerts/blackbox-exporter.yml | 86 ++++++ conf/prometheus/alerts/blackbox.yml | 74 ----- conf/prometheus/alerts/cadvisor.yml | 68 ----- conf/prometheus/alerts/embedded-exporter.yml | 257 ++++++++++++++++++ conf/prometheus/alerts/google-cadvisor.yml | 77 ++++++ conf/prometheus/alerts/kubestate-exporter.yml | 135 ++++----- conf/prometheus/alerts/mysqld-exporter.yml | 9 + .../alerts/{node.yml => node-exporter.yml} | 158 +++++++---- conf/prometheus/alerts/postgres-exporter.yml | 194 +++++++++++++ conf/prometheus/alerts/prometheus.yml | 236 ---------------- 10 files changed, 795 insertions(+), 499 deletions(-) create mode 100644 conf/prometheus/alerts/blackbox-exporter.yml delete mode 100644 conf/prometheus/alerts/blackbox.yml delete mode 100644 conf/prometheus/alerts/cadvisor.yml create mode 100644 conf/prometheus/alerts/embedded-exporter.yml create mode 100644 conf/prometheus/alerts/google-cadvisor.yml rename conf/prometheus/alerts/{node.yml => node-exporter.yml} (52%) create mode 100644 conf/prometheus/alerts/postgres-exporter.yml delete mode 100644 conf/prometheus/alerts/prometheus.yml diff --git a/conf/prometheus/alerts/blackbox-exporter.yml b/conf/prometheus/alerts/blackbox-exporter.yml new file mode 100644 index 0000000..3f90436 --- /dev/null +++ b/conf/prometheus/alerts/blackbox-exporter.yml @@ -0,0 +1,86 @@ +groups: + +- name: BlackboxExporter + + rules: + + - alert: BlackboxProbeFailed + expr: 'probe_success == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe failed (instance {{ $labels.instance }}) + description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxConfigurationReloadFailure + expr: 'blackbox_exporter_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox 
configuration reload failure (instance {{ $labels.instance }}) + description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSlowProbe + expr: 'avg_over_time(probe_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox slow probe (instance {{ $labels.instance }}) + description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeHttpFailure + expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) + description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20' + for: 0m + labels: + severity: warning + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateWillExpireSoon + expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) + description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxSslCertificateExpired + expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0' + for: 0m + labels: + severity: critical + annotations: + summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) + description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels 
}}" + + - alert: BlackboxProbeSlowHttp + expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) + description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: BlackboxProbeSlowPing + expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1' + for: 1m + labels: + severity: warning + annotations: + summary: Blackbox probe slow ping (instance {{ $labels.instance }}) + description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/conf/prometheus/alerts/blackbox.yml b/conf/prometheus/alerts/blackbox.yml deleted file mode 100644 index 1206d85..0000000 --- a/conf/prometheus/alerts/blackbox.yml +++ /dev/null @@ -1,74 +0,0 @@ -groups: - - name: blackbox - rules: - - alert: BlackboxProbeFailed - expr: probe_success == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe failed (instance {{ $labels.instance }}) - description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxSlowProbe - expr: avg_over_time(probe_duration_seconds[1m]) > 2 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox slow probe (instance {{ $labels.instance }}) - description: "Blackbox probe took more than 2s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxProbeHttpFailure - expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) - description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30 - for: 0m - labels: - severity: warning - annotations: - summary: Blackbox SSL certificate will 
expire soon (instance {{ $labels.instance }}) - description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxSslCertificateWillExpireSoon - expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) - description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxSslCertificateExpired - expr: probe_ssl_earliest_cert_expiry - time() <= 0 - for: 0m - labels: - severity: critical - annotations: - summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) - description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxProbeSlowHttp - expr: avg_over_time(probe_http_duration_seconds[1m]) > 2 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) - description: "HTTP request took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: BlackboxProbeSlowPing - expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 2 - for: 1m - labels: - severity: warning - annotations: - summary: Blackbox probe slow ping (instance {{ $labels.instance }}) - description: "Blackbox ping took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/conf/prometheus/alerts/cadvisor.yml b/conf/prometheus/alerts/cadvisor.yml deleted file mode 100644 index b3bc5c4..0000000 --- a/conf/prometheus/alerts/cadvisor.yml +++ /dev/null @@ -1,68 +0,0 @@ -groups: - - name: cadvisor - rules: - # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. 
- - alert: ContainerKilled - expr: time() - container_last_seen > 60 - for: 0m - labels: - severity: warning - annotations: - summary: Container killed (instance {{ $labels.instance }}) - description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment. - - alert: ContainerAbsent - expr: absent(container_last_seen) - for: 5m - labels: - severity: warning - annotations: - summary: Container absent (instance {{ $labels.instance }}) - description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ContainerCpuUsage - expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80 - for: 2m - labels: - severity: warning - annotations: - summary: Container CPU usage (instance {{ $labels.instance }}) - description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d - - alert: ContainerMemoryUsage - expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80 - for: 2m - labels: - severity: warning - annotations: - summary: Container Memory usage (instance {{ $labels.instance }}) - description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ContainerVolumeUsage - expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 - for: 2m - labels: - severity: warning - annotations: - summary: Container Volume usage (instance {{ $labels.instance }}) - description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ContainerVolumeIoUsage - expr: (sum(container_fs_io_current{name!=""}) BY 
(instance, name) * 100) > 80 - for: 2m - labels: - severity: warning - annotations: - summary: Container Volume IO usage (instance {{ $labels.instance }}) - description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: ContainerHighThrottleRate - expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 - for: 2m - labels: - severity: warning - annotations: - summary: Container high throttle rate (instance {{ $labels.instance }}) - description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/conf/prometheus/alerts/embedded-exporter.yml b/conf/prometheus/alerts/embedded-exporter.yml new file mode 100644 index 0000000..65bfd82 --- /dev/null +++ b/conf/prometheus/alerts/embedded-exporter.yml @@ -0,0 +1,257 @@ +groups: + +- name: EmbeddedExporter + + rules: + + - alert: PrometheusJobMissing + expr: 'absent(up{job="prometheus"})' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus job missing (instance {{ $labels.instance }}) + description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissing + expr: 'up == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing (instance {{ $labels.instance }}) + description: "A Prometheus target has disappeared. 
An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAllTargetsMissing + expr: 'sum by (job) (up) == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus all targets missing (instance {{ $labels.instance }}) + description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetMissingWithWarmupTime + expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) + description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusConfigurationReloadFailure + expr: 'prometheus_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) + description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTooManyRestarts + expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus too many restarts (instance {{ $labels.instance }}) + description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerJobMissing + expr: 'absent(up{job="alertmanager"})' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) + description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigurationReloadFailure + expr: 'alertmanager_config_last_reload_successful != 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) + description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerConfigNotSynced + expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) + description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerE2eDeadManSwitch + expr: 'vector(1)' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) + description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotConnectedToAlertmanager + expr: 'prometheus_notifications_alertmanagers_discovered < 1' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) + description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationFailures + expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTemplateTextExpansionFailures + expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusRuleEvaluationSlow + expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) + description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusNotificationsBacklog + expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus notifications backlog (instance {{ $labels.instance }}) + description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusAlertmanagerNotificationFailing + expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) + description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetEmpty + expr: 'prometheus_sd_discovered_targets == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus target empty (instance {{ $labels.instance }}) + description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapingSlow + expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus target scraping slow (instance {{ $labels.instance }}) + description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. 
Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusLargeScrape + expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10' + for: 5m + labels: + severity: warning + annotations: + summary: Prometheus large scrape (instance {{ $labels.instance }}) + description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTargetScrapeDuplicate + expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) + description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointCreationFailures + expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCheckpointDeletionFailures + expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbCompactionsFailed + expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB 
compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbHeadTruncationsFailed + expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbReloadFailures + expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalCorruptions + expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTsdbWalTruncationsFailed + expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) + description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PrometheusTimeseriesCardinality + expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000' + for: 0m + labels: + severity: warning + annotations: + summary: Prometheus timeseries cardinality (instance {{ $labels.instance }}) + description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS 
= {{ $labels }}" diff --git a/conf/prometheus/alerts/google-cadvisor.yml b/conf/prometheus/alerts/google-cadvisor.yml new file mode 100644 index 0000000..cfbc333 --- /dev/null +++ b/conf/prometheus/alerts/google-cadvisor.yml @@ -0,0 +1,77 @@ +groups: + +- name: GoogleCadvisor + + rules: + + - alert: ContainerKilled + expr: 'time() - container_last_seen > 60' + for: 0m + labels: + severity: warning + annotations: + summary: Container killed (instance {{ $labels.instance }}) + description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerAbsent + expr: 'absent(container_last_seen)' + for: 5m + labels: + severity: warning + annotations: + summary: Container absent (instance {{ $labels.instance }}) + description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerHighCpuUtilization + expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80' + for: 2m + labels: + severity: warning + annotations: + summary: Container High CPU utilization (instance {{ $labels.instance }}) + description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerHighMemoryUsage + expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80' + for: 2m + labels: + severity: warning + annotations: + summary: Container High Memory usage (instance {{ $labels.instance }}) + description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerVolumeUsage + expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80' + for: 2m + labels: + severity: warning + annotations: + summary: 
Container Volume usage (instance {{ $labels.instance }}) + description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerHighThrottleRate + expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )' + for: 5m + labels: + severity: warning + annotations: + summary: Container high throttle rate (instance {{ $labels.instance }}) + description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerLowCpuUtilization + expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20' + for: 7d + labels: + severity: info + annotations: + summary: Container Low CPU utilization (instance {{ $labels.instance }}) + description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: ContainerLowMemoryUsage + expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20' + for: 7d + labels: + severity: info + annotations: + summary: Container Low Memory usage (instance {{ $labels.instance }}) + description: "Container Memory usage is under 20% for 1 week. 
Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/conf/prometheus/alerts/kubestate-exporter.yml b/conf/prometheus/alerts/kubestate-exporter.yml index 97675bb..7e32694 100644 --- a/conf/prometheus/alerts/kubestate-exporter.yml +++ b/conf/prometheus/alerts/kubestate-exporter.yml @@ -4,50 +4,50 @@ groups: rules: - - alert: KubernetesNodeReady + - alert: KubernetesNodeNotReady expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' for: 10m labels: severity: critical annotations: - summary: Kubernetes Node ready (instance {{ $labels.instance }}) + summary: Kubernetes Node not ready (node {{ $labels.node }}) description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesMemoryPressure + - alert: KubernetesNodeMemoryPressure expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1' for: 2m labels: severity: critical annotations: - summary: Kubernetes memory pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes memory pressure (node {{ $labels.node }}) + description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesDiskPressure + - alert: KubernetesNodeDiskPressure expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1' for: 2m labels: severity: critical annotations: - summary: Kubernetes disk pressure (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes disk pressure (node {{ $labels.node }}) + description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesNetworkUnavailable + - alert:
KubernetesNodeNetworkUnavailable expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1' for: 2m labels: severity: critical annotations: - summary: Kubernetes network unavailable (instance {{ $labels.instance }}) - description: "{{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes Node network unavailable (node {{ $labels.node }}) + description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesOutOfCapacity - expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' + - alert: KubernetesNodeOutOfPodCapacity + expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' for: 2m labels: severity: warning annotations: - summary: Kubernetes out of capacity (instance {{ $labels.instance }}) - description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes Node out of pod capacity (node {{ $labels.node }}) + description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesContainerOomKiller expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1' @@ -55,7 +55,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes container oom killer (instance {{ $labels.instance }}) + summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{
$labels.container }}) description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobFailed @@ -64,8 +64,17 @@ groups: labels: severity: warning annotations: - summary: Kubernetes Job failed (instance {{ $labels.instance }}) - description: "Job {{ $labels.namespace }}/{{ $labels.exported_job }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }}) + description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: KubernetesJobNotStarting + expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600' + for: 0m + labels: + severity: warning + annotations: + summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }}) + description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesCronjobSuspended expr: 'kube_cronjob_spec_suspend != 0' @@ -73,7 +82,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) + summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPersistentvolumeclaimPending @@ -82,7 +91,7 @@ groups: labels: severity: warning annotations: - summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) + summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}) description: 
"PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesVolumeOutOfDiskSpace @@ -95,13 +104,13 @@ groups: description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesVolumeFullInFourDays - expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0' + expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0' for: 0m labels: severity: critical annotations: summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) - description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPersistentvolumeError expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' @@ -109,8 +118,8 @@ groups: labels: severity: critical annotations: - summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) - description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }}) + description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetDown expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0' @@ -118,35 +127,35 @@ groups: labels: severity: critical annotations: - summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) - description: "A 
StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }}) + description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesHpaScalingAbility - expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' + - alert: KubernetesHpaScaleInability + expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0' for: 2m labels: severity: warning annotations: - summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) - description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes HPA scale inability (instance {{ $labels.instance }}) + description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesHpaMetricAvailability + - alert: KubernetesHpaMetricsUnavailability expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1' for: 0m labels: severity: warning annotations: - summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) - description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }}) + description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesHpaScaleCapability - expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas' + - alert: 
KubernetesHpaScaleMaximum + expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)' for: 2m labels: severity: info annotations: - summary: Kubernetes HPA scale capability (instance {{ $labels.instance }}) - description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }}) + description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesHpaUnderutilized expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' @@ -155,7 +164,7 @@ groups: severity: info annotations: summary: Kubernetes HPA underutilized (instance {{ $labels.instance }}) - description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. 
Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodNotHealthy expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0' @@ -163,8 +172,8 @@ groups: labels: severity: critical annotations: - summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) - description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }}) + description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesPodCrashLooping expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3' @@ -172,17 +181,17 @@ groups: labels: severity: warning annotations: - summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) - description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }}) + description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - alert: KubernetesReplicassetMismatch + - alert: KubernetesReplicasetReplicasMismatch expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas' for: 10m labels: severity: warning annotations: - summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }}) - description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes ReplicaSet replicas mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }}) + description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDeploymentReplicasMismatch expr: 'kube_deployment_spec_replicas !=
kube_deployment_status_replicas_available' @@ -190,8 +199,8 @@ groups: labels: severity: warning annotations: - summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) - description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }}) + description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetReplicasMismatch expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas' @@ -200,7 +209,7 @@ groups: severity: warning annotations: summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) - description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDeploymentGenerationMismatch expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation' @@ -208,8 +217,8 @@ groups: labels: severity: critical annotations: - summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) - description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }}) + description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetGenerationMismatch expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation' @@ -217,8 +226,8 @@ groups: labels: severity: critical annotations: - summary: Kubernetes StatefulSet generation
mismatch (instance {{ $labels.instance }}) - description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }}) + description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesStatefulsetUpdateNotRolledOut expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)' @@ -226,8 +235,8 @@ groups: labels: severity: warning annotations: - summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }}) - description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }}) + description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesDaemonsetRolloutStuck expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0' @@ -235,8 +244,8 @@ groups: labels: severity: warning annotations: - summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) - description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }}) + description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: 
KubernetesDaemonsetMisscheduled expr: 'kube_daemonset_status_number_misscheduled > 0' @@ -244,8 +253,8 @@ groups: labels: severity: critical annotations: - summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) - description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }}) + description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesCronjobTooLong expr: 'time() - kube_cronjob_next_schedule_time > 3600' @@ -253,20 +262,20 @@ groups: labels: severity: warning annotations: - summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) + summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }}) description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesJobSlowCompletion - expr: 'kube_job_spec_completions - kube_job_status_succeeded > 0' + expr: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0' for: 12h labels: severity: critical annotations: - summary: Kubernetes job slow completion (instance {{ $labels.instance }}) + summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }}) description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerErrors - expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' + expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / 
sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3' for: 2m labels: severity: critical @@ -302,7 +311,7 @@ groups: description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: KubernetesApiServerLatency - expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1' + expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1' for: 2m labels: severity: warning diff --git a/conf/prometheus/alerts/mysqld-exporter.yml b/conf/prometheus/alerts/mysqld-exporter.yml index ad8ed5f..380fca3 100644 --- a/conf/prometheus/alerts/mysqld-exporter.yml +++ b/conf/prometheus/alerts/mysqld-exporter.yml @@ -22,6 +22,15 @@ groups: summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }}) description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: MysqlHighPreparedStatementsUtilization(>80%) + expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80' + for: 2m + labels: + severity: warning + annotations: + summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }}) + description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: MysqlHighThreadsRunning expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60' for: 2m diff --git a/conf/prometheus/alerts/node.yml b/conf/prometheus/alerts/node-exporter.yml similarity index 52% rename from conf/prometheus/alerts/node.yml rename to 
conf/prometheus/alerts/node-exporter.yml index 364bd44..6a465d9 100644 --- a/conf/prometheus/alerts/node.yml +++ b/conf/prometheus/alerts/node-exporter.yml @@ -1,8 +1,11 @@ groups: - - name: Node - rules: + +- name: NodeExporter + + rules: + - alert: HostOutOfMemory - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -11,7 +14,7 @@ groups: description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostMemoryUnderMemoryPressure - expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -19,8 +22,17 @@ groups: summary: Host memory under memory pressure (instance {{ $labels.instance }}) description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostMemoryIsUnderutilized + expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1w + labels: + severity: info + annotations: + summary: Host Memory is underutilized (instance {{ $labels.instance }}) + description: "Node memory is < 20% for 1 week. Consider reducing memory space. 
(instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostUnusualNetworkThroughputIn - expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 + expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning @@ -29,7 +41,7 @@ groups: description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualNetworkThroughputOut - expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 + expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning @@ -38,7 +50,7 @@ groups: description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskReadRate - expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning @@ -47,7 +59,7 @@ groups: description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskWriteRate - expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -55,11 +67,8 @@ groups: summary: Host unusual disk write rate (instance {{ $labels.instance }}) description: "Disk is probably writing too much data (> 50 MB/s)\n 
VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. - alert: HostOutOfDiskSpace - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -67,11 +76,8 @@ groups: summary: Host out of disk space (instance {{ $labels.instance }}) description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # Please add ignored mountpoints in node_exporter parameters like - # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". - # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
- alert: HostDiskWillFillIn24Hours - expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 + expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -80,7 +86,7 @@ groups: description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOutOfInodes - expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -88,8 +94,17 @@ groups: summary: Host out of inodes (instance {{ $labels.instance }}) description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostFilesystemDeviceError + expr: 'node_filesystem_device_error == 1' + for: 2m + labels: + severity: critical + annotations: + summary: Host filesystem device error (instance {{ $labels.instance }}) + description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostInodesWillFillIn24Hours - expr: 
node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 + expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -98,7 +113,7 @@ groups: description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskReadLatency - expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 + expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -107,25 +122,34 @@ groups: description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostUnusualDiskWriteLatency - expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.25 and rate(node_disk_writes_completed_total[1m]) > 0 + expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning annotations: summary: Host unusual disk write latency (instance {{ $labels.instance }}) - 
description: "Disk latency is growing (write operations > 250ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostHighCpuLoad - expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80 - for: 0m + expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m labels: severity: warning annotations: summary: Host high CPU load (instance {{ $labels.instance }}) description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostCpuIsUnderutilized + expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 1w + labels: + severity: info + annotations: + summary: Host CPU is underutilized (instance {{ $labels.instance }}) + description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostCpuStealNoisyNeighbor - expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 + expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning @@ -133,20 +157,38 @@ groups: summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - # 1000 context switches is an arbitrary number. - # Alert threshold depends on nature of application. 
- # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 - - alert: HostContextSwitching - expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 4000 + - alert: HostCpuHighIowait + expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning annotations: - summary: Host context switching (instance {{ $labels.instance }}) - description: "Context switching is growing on node (> 4000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + summary: Host CPU high iowait (instance {{ $labels.instance }}) + description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostUnusualDiskIo + expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 5m + labels: + severity: warning + annotations: + summary: Host unusual disk IO (instance {{ $labels.instance }}) + description: "Time spent in IO is too high on {{ $labels.instance }}. 
Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: HostContextSwitchingHigh + expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) +/ +(rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2 +' + for: 0m + labels: + severity: warning + annotations: + summary: Host context switching high (instance {{ $labels.instance }}) + description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSwapIsFillingUp - expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 + expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -155,7 +197,7 @@ groups: description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostSystemdServiceCrashed - expr: node_systemd_unit_state{state="failed"} == 1 + expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning @@ -164,7 +206,7 @@ groups: description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostPhysicalComponentTooHot - expr: node_hwmon_temp_celsius > 75 + expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning @@ -173,7 +215,7 @@ groups: description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNodeOvertemperatureAlarm - expr: node_hwmon_temp_crit_alarm_celsius == 1 + expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or 
(node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: critical @@ -182,16 +224,16 @@ groups: description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostRaidArrayGotInactive - expr: node_md_state{state="inactive"} > 0 + expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: critical annotations: summary: Host RAID array got inactive (instance {{ $labels.instance }}) - description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostRaidDiskFailure - expr: node_md_disks{state="failed"} > 0 + expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -199,17 +241,17 @@ groups: summary: Host RAID disk failure (instance {{ $labels.instance }}) description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - #- alert: HostKernelVersionDeviations - # expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 - # for: 6h - # labels: - # severity: warning - # annotations: - # summary: Host kernel version deviations (instance {{ $labels.instance }}) - # description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + - alert: HostKernelVersionDeviations + expr: 'count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1' + for: 6h + labels: + severity: warning + annotations: + summary: Host kernel version deviations + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostOomKillDetected - expr: increase(node_vmstat_oom_kill[1m]) > 0 + expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning @@ -218,7 +260,7 @@ groups: description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacCorrectableErrorsDetected - expr: increase(node_edac_correctable_errors_total[1m]) > 0 + expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: info @@ -227,7 +269,7 @@ groups: description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostEdacUncorrectableErrorsDetected - expr: node_edac_uncorrectable_errors_total > 0 + expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left
(nodename) node_uname_info{nodename=~".+"}' for: 0m labels: severity: warning @@ -236,7 +278,7 @@ groups: description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkReceiveErrors - expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 + expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -245,7 +287,7 @@ groups: description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkTransmitErrors - expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 + expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -254,7 +296,7 @@ groups: description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkInterfaceSaturated - expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000 + expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) 
group_left (nodename) node_uname_info{nodename=~".+"}' for: 1m labels: severity: warning @@ -263,7 +305,7 @@ groups: description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostNetworkBondDegraded - expr: (node_bonding_active - node_bonding_slaves) != 0 + expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -272,7 +314,7 @@ groups: description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostConntrackLimit - expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 + expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 5m labels: severity: warning @@ -281,8 +323,8 @@ groups: description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostClockSkew - expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) - for: 2m + expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' + for: 10m labels: severity: warning annotations: @@ -290,7 +332,7 @@ groups: description: "Clock skew detected. Clock is out of sync. 
Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostClockNotSynchronising - expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 + expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 2m labels: severity: warning @@ -299,7 +341,7 @@ groups: description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - alert: HostRequiresReboot - expr: node_reboot_required > 0 + expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}' for: 4h labels: severity: info diff --git a/conf/prometheus/alerts/postgres-exporter.yml b/conf/prometheus/alerts/postgres-exporter.yml new file mode 100644 index 0000000..96ae5ea --- /dev/null +++ b/conf/prometheus/alerts/postgres-exporter.yml @@ -0,0 +1,194 @@ +groups: + +- name: PostgresExporter + + rules: + + - alert: PostgresqlDown + expr: 'pg_up == 0' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{ $labels.instance }}) + description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlRestarted + expr: 'time() - pg_postmaster_start_time_seconds < 60' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql restarted (instance {{ $labels.instance }}) + description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlExporterError + expr: 'pg_exporter_last_scrape_error > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql exporter error (instance {{ $labels.instance }}) + description: "Postgresql exporter is showing errors. 
A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTableNotAutoVacuumed + expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }}) + description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTableNotAutoAnalyzed + expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto analyzed (instance {{ $labels.instance }}) + description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyConnections + expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)' + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{ $labels.instance }}) + description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlNotEnoughConnections + expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5' + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql not enough connections (instance {{ $labels.instance }}) + description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlDeadLocks + expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{ 
$labels.instance }}) + description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRollbackRate + expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02' + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql high rollback rate (instance {{ $labels.instance }}) + description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlCommitRateLow + expr: 'rate(pg_stat_database_xact_commit[1m]) < 10' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql commit rate low (instance {{ $labels.instance }}) + description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlLowXidConsumption + expr: 'rate(pg_txid_current[1m]) < 5' + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql low XID consumption (instance {{ $labels.instance }}) + description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateStatementTimeout + expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate statement timeout (instance {{ $labels.instance }}) + description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlHighRateDeadlock + expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate 
deadlock (instance {{ $labels.instance }}) + description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlUnusedReplicationSlot + expr: 'pg_replication_slots_active == 0' + for: 1m + labels: + severity: warning + annotations: + summary: Postgresql unused replication slot (instance {{ $labels.instance }}) + description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyDeadTuples + expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1' + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many dead tuples (instance {{ $labels.instance }}) + description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlConfigurationChanged + expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m' + for: 0m + labels: + severity: info + annotations: + summary: Postgresql configuration changed (instance {{ $labels.instance }}) + description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlSslCompressionActive + expr: 'sum(pg_stat_ssl_compression) > 0' + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql SSL compression active (instance {{ $labels.instance }}) + description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. 
Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlTooManyLocksAcquired + expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20' + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql too many locks acquired (instance {{ $labels.instance }}) + description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlBloatIndexHigh(>80%) + expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)' + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }}) + description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlBloatTableHigh(>80%) + expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)' + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }}) + description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" + + - alert: PostgresqlInvalidIndex + expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}' + for: 6h + labels: + severity: warning + annotations: + summary: Postgresql invalid index (instance {{ $labels.instance }}) + description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. 
You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/conf/prometheus/alerts/prometheus.yml b/conf/prometheus/alerts/prometheus.yml deleted file mode 100644 index f55aabb..0000000 --- a/conf/prometheus/alerts/prometheus.yml +++ /dev/null @@ -1,236 +0,0 @@ -groups: - - name: Prometheus - rules: - - alert: PrometheusJobMissing - expr: absent(up{job="prometheus"}) - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus job missing (instance {{ $labels.instance }}) - description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTargetMissing - expr: up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target missing (instance {{ $labels.instance }}) - description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusAllTargetsMissing - expr: count by (job) (up) == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus all targets missing (instance {{ $labels.instance }}) - description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusConfigurationReloadFailure - expr: prometheus_config_last_reload_successful != 1 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) - description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTooManyRestarts - expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus too many restarts (instance {{ $labels.instance }}) - description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusAlertmanagerJobMissing - expr: absent(up{job="alertmanager"}) - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager job missing (instance {{ $labels.instance }}) - description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusAlertmanagerConfigurationReloadFailure - expr: alertmanager_config_last_reload_successful != 1 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }}) - description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusAlertmanagerConfigNotSynced - expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }}) - description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusAlertmanagerE2eDeadManSwitch - expr: vector(1) - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }}) - description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusNotConnectedToAlertmanager - expr: prometheus_notifications_alertmanagers_discovered < 1 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }}) - description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusRuleEvaluationFailures - expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTemplateTextExpansionFailures - expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusRuleEvaluationSlow - expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) - description: "Prometheus rule evaluation took more time than the scheduled interval. 
It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusNotificationsBacklog - expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus notifications backlog (instance {{ $labels.instance }}) - description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusAlertmanagerNotificationFailing - expr: rate(alertmanager_notifications_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) - description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTargetEmpty - expr: prometheus_sd_discovered_targets == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus target empty (instance {{ $labels.instance }}) - description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTargetScrapingSlow - expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus target scraping slow (instance {{ $labels.instance }}) - description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusLargeScrape - expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 - for: 5m - labels: - severity: warning - annotations: - summary: Prometheus large scrape (instance {{ $labels.instance }}) - description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTargetScrapeDuplicate - expr: 
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 - for: 0m - labels: - severity: warning - annotations: - summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) - description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbCheckpointCreationFailures - expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbCheckpointDeletionFailures - expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbCompactionsFailed - expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbHeadTruncationsFailed - expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbReloadFailures - expr: 
increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbWalCorruptions - expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" - - - alert: PrometheusTsdbWalTruncationsFailed - expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) - description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"