commit f7d73a0df5 (parent 8e94b34e63)
10 changed files with 795 additions and 499 deletions
86  conf/prometheus/alerts/blackbox-exporter.yml  Normal file
@@ -0,0 +1,86 @@
groups:

  - name: BlackboxExporter

    rules:

      - alert: BlackboxProbeFailed
        expr: 'probe_success == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe failed (instance {{ $labels.instance }})
          description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxConfigurationReloadFailure
        expr: 'blackbox_exporter_config_last_reload_successful != 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
          description: "Blackbox configuration reload failure\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSlowProbe
        expr: 'avg_over_time(probe_duration_seconds[1m]) > 1'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox slow probe (instance {{ $labels.instance }})
          description: "Blackbox probe took more than 1s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxProbeHttpFailure
        expr: 'probe_http_status_code <= 199 OR probe_http_status_code >= 400'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
          description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSslCertificateWillExpireSoon
        expr: '3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in less than 20 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSslCertificateWillExpireSoon
        expr: '0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
          description: "SSL certificate expires in less than 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxSslCertificateExpired
        expr: 'round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
          description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxProbeSlowHttp
        expr: 'avg_over_time(probe_http_duration_seconds[1m]) > 1'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
          description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: BlackboxProbeSlowPing
        expr: 'avg_over_time(probe_icmp_duration_seconds[1m]) > 1'
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: Blackbox probe slow ping (instance {{ $labels.instance }})
          description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
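Editor's note: the blackbox rules above only evaluate to something useful if Prometheus scrapes probe_* metrics from a blackbox_exporter. The sketch below is the conventional relabeling pattern for that; the job name, module, exporter address and probe targets are assumptions, not part of this commit.

# Hypothetical prometheus.yml fragment (sketch, not part of this commit)
scrape_configs:
  - job_name: 'blackbox'
    metrics_path: /probe
    params:
      module: [http_2xx]            # module must exist in the exporter's blackbox.yml (assumed)
    static_configs:
      - targets:
          - https://example.org     # endpoints to probe (assumed)
    relabel_configs:
      - source_labels: [__address__]        # move the probe target into the ?target= parameter
        target_label: __param_target
      - source_labels: [__param_target]     # keep the probed URL as the instance label
        target_label: instance
      - target_label: __address__           # scrape the exporter itself
        replacement: blackbox-exporter:9115 # exporter address (assumed)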
@@ -1,74 +0,0 @@
groups:
- name: blackbox
  rules:

  - alert: BlackboxProbeFailed
    expr: probe_success == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox probe failed (instance {{ $labels.instance }})
      description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxSlowProbe
    expr: avg_over_time(probe_duration_seconds[1m]) > 2
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Blackbox slow probe (instance {{ $labels.instance }})
      description: "Blackbox probe took more than 2s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
      description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
      description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxSslCertificateWillExpireSoon
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
      description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxSslCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 0m
    labels:
      severity: critical
    annotations:
      summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
      description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxProbeSlowHttp
    expr: avg_over_time(probe_http_duration_seconds[1m]) > 2
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
      description: "HTTP request took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: BlackboxProbeSlowPing
    expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 2
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: Blackbox probe slow ping (instance {{ $labels.instance }})
      description: "Blackbox ping took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
@@ -1,68 +0,0 @@
groups:
- name: cadvisor
  rules:

  # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
  - alert: ContainerKilled
    expr: time() - container_last_seen > 60
    for: 0m
    labels:
      severity: warning
    annotations:
      summary: Container killed (instance {{ $labels.instance }})
      description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
  - alert: ContainerAbsent
    expr: absent(container_last_seen)
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: Container absent (instance {{ $labels.instance }})
      description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: ContainerCpuUsage
    expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container CPU usage (instance {{ $labels.instance }})
      description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  # See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
  - alert: ContainerMemoryUsage
    expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Memory usage (instance {{ $labels.instance }})
      description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: ContainerVolumeUsage
    expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Volume usage (instance {{ $labels.instance }})
      description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: ContainerVolumeIoUsage
    expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container Volume IO usage (instance {{ $labels.instance }})
      description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

  - alert: ContainerHighThrottleRate
    expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Container high throttle rate (instance {{ $labels.instance }})
      description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
257  conf/prometheus/alerts/embedded-exporter.yml  Normal file
@@ -0,0 +1,257 @@
groups:

  - name: EmbeddedExporter

    rules:

      - alert: PrometheusJobMissing
        expr: 'absent(up{job="prometheus"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus job missing (instance {{ $labels.instance }})
          description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissing
        expr: 'up == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing (instance {{ $labels.instance }})
          description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAllTargetsMissing
        expr: 'sum by (job) (up) == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus all targets missing (instance {{ $labels.instance }})
          description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetMissingWithWarmupTime
        expr: 'sum by (instance, job) ((up == 0) * on (instance) group_right(job) (node_time_seconds - node_boot_time_seconds > 600))'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
          description: "Allow a job time to start up (10 minutes) before alerting that it's down.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusConfigurationReloadFailure
        expr: 'prometheus_config_last_reload_successful != 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
          description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTooManyRestarts
        expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus too many restarts (instance {{ $labels.instance }})
          description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerJobMissing
        expr: 'absent(up{job="alertmanager"})'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
          description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigurationReloadFailure
        expr: 'alertmanager_config_last_reload_successful != 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
          description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerConfigNotSynced
        expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
          description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerE2eDeadManSwitch
        expr: 'vector(1)'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
          description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotConnectedToAlertmanager
        expr: 'prometheus_notifications_alertmanagers_discovered < 1'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
          description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationFailures
        expr: 'increase(prometheus_rule_evaluation_failures_total[3m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTemplateTextExpansionFailures
        expr: 'increase(prometheus_template_text_expansion_failures_total[3m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusRuleEvaluationSlow
        expr: 'prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
          description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusNotificationsBacklog
        expr: 'min_over_time(prometheus_notifications_queue_length[10m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus notifications backlog (instance {{ $labels.instance }})
          description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusAlertmanagerNotificationFailing
        expr: 'rate(alertmanager_notifications_failed_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
          description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetEmpty
        expr: 'prometheus_sd_discovered_targets == 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus target empty (instance {{ $labels.instance }})
          description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapingSlow
        expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scraping slow (instance {{ $labels.instance }})
          description: "Prometheus is scraping exporters slowly since it exceeded the requested interval time. Your Prometheus server is under-provisioned.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusLargeScrape
        expr: 'increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Prometheus large scrape (instance {{ $labels.instance }})
          description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTargetScrapeDuplicate
        expr: 'increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
          description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointCreationFailures
        expr: 'increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCheckpointDeletionFailures
        expr: 'increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbCompactionsFailed
        expr: 'increase(prometheus_tsdb_compactions_failed_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbHeadTruncationsFailed
        expr: 'increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbReloadFailures
        expr: 'increase(prometheus_tsdb_reloads_failures_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalCorruptions
        expr: 'increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTsdbWalTruncationsFailed
        expr: 'increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0'
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
          description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: PrometheusTimeseriesCardinality
        expr: 'label_replace(count by(__name__) ({__name__=~".+"}), "name", "$1", "__name__", "(.+)") > 10000'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Prometheus timeseries cardinality (instance {{ $labels.instance }})
          description: "The \"{{ $labels.name }}\" timeseries cardinality is getting very high: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
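Editor's note: these rule files only take effect if prometheus.yml loads them, and the Alertmanager-related rules (including the always-firing PrometheusAlertmanagerE2eDeadManSwitch) assume an Alertmanager is configured. A minimal wiring sketch follows; the mount path and Alertmanager address are assumptions, not part of this commit.

# Hypothetical prometheus.yml fragment (sketch, not part of this commit)
rule_files:
  - /etc/prometheus/alerts/*.yml        # where conf/prometheus/alerts/ is mounted (assumed)
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']  # Alertmanager address (assumed)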
77  conf/prometheus/alerts/google-cadvisor.yml  Normal file
@@ -0,0 +1,77 @@
groups:

  - name: GoogleCadvisor

    rules:

      - alert: ContainerKilled
        expr: 'time() - container_last_seen > 60'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: Container killed (instance {{ $labels.instance }})
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerAbsent
        expr: 'absent(container_last_seen)'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Container absent (instance {{ $labels.instance }})
          description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerHighCpuUtilization
        expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container High CPU utilization (instance {{ $labels.instance }})
          description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerHighMemoryUsage
        expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container High Memory usage (instance {{ $labels.instance }})
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerVolumeUsage
        expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Container Volume usage (instance {{ $labels.instance }})
          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerHighThrottleRate
        expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: Container high throttle rate (instance {{ $labels.instance }})
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerLowCpuUtilization
        expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
        for: 7d
        labels:
          severity: info
        annotations:
          summary: Container Low CPU utilization (instance {{ $labels.instance }})
          description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      - alert: ContainerLowMemoryUsage
        expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
        for: 7d
        labels:
          severity: info
        annotations:
          summary: Container Low Memory usage (instance {{ $labels.instance }})
          description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
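Editor's note: the container_* series above come from cAdvisor, and the pod/container labels used by the CPU and throttle rules are only present when cAdvisor runs with Kubernetes metadata (for example, embedded in the kubelet). A minimal standalone scrape sketch; the job name and address are assumptions, not part of this commit.

# Hypothetical prometheus.yml fragment (sketch, not part of this commit)
scrape_configs:
  - job_name: 'cadvisor'
    static_configs:
      - targets: ['cadvisor:8080']   # cAdvisor address (assumed)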
@@ -4,50 +4,50 @@ groups:
   rules:
 
-  - alert: KubernetesNodeReady
+  - alert: KubernetesNodeNotReady
     expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
     for: 10m
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes Node ready (instance {{ $labels.instance }})
+      summary: Kubernetes Node ready (node {{ $labels.node }})
       description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesMemoryPressure
+  - alert: KubernetesNodeMemoryPressure
     expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
     for: 2m
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes memory pressure (instance {{ $labels.instance }})
+      summary: Kubernetes memory pressure (node {{ $labels.node }})
-      description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Node {{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesDiskPressure
+  - alert: KubernetesNodeDiskPressure
     expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
     for: 2m
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes disk pressure (instance {{ $labels.instance }})
+      summary: Kubernetes disk pressure (node {{ $labels.node }})
-      description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Node {{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesNetworkUnavailable
+  - alert: KubernetesNodeNetworkUnavailable
     expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
     for: 2m
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes network unavailable (instance {{ $labels.instance }})
+      summary: Kubernetes Node network unavailable (instance {{ $labels.instance }})
-      description: "{{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Node {{ $labels.node }} has NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesOutOfCapacity
+  - alert: KubernetesNodeOutOfPodCapacity
-    expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
+    expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid, instance) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90'
     for: 2m
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes out of capacity (instance {{ $labels.instance }})
+      summary: Kubernetes Node out of pod capacity (instance {{ $labels.instance }})
-      description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Node {{ $labels.node }} is out of pod capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesContainerOomKiller
     expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1'
@@ -55,7 +55,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes container oom killer (instance {{ $labels.instance }})
+      summary: Kubernetes container oom killer ({{ $labels.namespace }}/{{ $labels.pod }}:{{ $labels.container }})
       description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesJobFailed
@@ -64,8 +64,17 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes Job failed (instance {{ $labels.instance }})
+      summary: Kubernetes Job failed ({{ $labels.namespace }}/{{ $labels.job_name }})
-      description: "Job {{ $labels.namespace }}/{{ $labels.exported_job }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+  - alert: KubernetesJobNotStarting
+    expr: 'kube_job_status_active == 0 and kube_job_status_failed == 0 and kube_job_status_succeeded == 0 and (time() - kube_job_status_start_time) > 600'
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: Kubernetes Job not starting ({{ $labels.namespace }}/{{ $labels.job_name }})
+      description: "Job {{ $labels.namespace }}/{{ $labels.job_name }} did not start for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesCronjobSuspended
     expr: 'kube_cronjob_spec_suspend != 0'
@@ -73,7 +82,7 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes CronJob suspended (instance {{ $labels.instance }})
+      summary: Kubernetes CronJob suspended ({{ $labels.namespace }}/{{ $labels.cronjob }})
       description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesPersistentvolumeclaimPending
@@ -82,7 +91,7 @@ groups:
     labels:
      severity: warning
     annotations:
-      summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})
+      summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
       description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesVolumeOutOfDiskSpace
@@ -95,13 +104,13 @@ groups:
       description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesVolumeFullInFourDays
-    expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0'
+    expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h:5m], 4 * 24 * 3600) < 0'
     for: 0m
     labels:
       severity: critical
     annotations:
       summary: Kubernetes Volume full in four days (instance {{ $labels.instance }})
-      description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Volume under {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesPersistentvolumeError
     expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0'
@@ -109,8 +118,8 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }})
+      summary: Kubernetes PersistentVolumeClaim pending ({{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }})
-      description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Persistent volume {{ $labels.persistentvolume }} is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesStatefulsetDown
     expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0'
@@ -118,35 +127,35 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes StatefulSet down (instance {{ $labels.instance }})
+      summary: Kubernetes StatefulSet down ({{ $labels.namespace }}/{{ $labels.statefulset }})
-      description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesHpaScalingAbility
+  - alert: KubernetesHpaScaleInability
-    expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1'
+    expr: '(kube_horizontalpodautoscaler_spec_max_replicas - kube_horizontalpodautoscaler_status_desired_replicas) * on (horizontalpodautoscaler,namespace) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true"} == 1) == 0'
     for: 2m
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }})
+      summary: Kubernetes HPA scale inability (instance {{ $labels.instance }})
-      description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesHpaMetricAvailability
+  - alert: KubernetesHpaMetricsUnavailability
     expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1'
     for: 0m
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes HPA metric availability (instance {{ $labels.instance }})
+      summary: Kubernetes HPA metrics unavailability (instance {{ $labels.instance }})
-      description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is unable to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesHpaScaleCapability
+  - alert: KubernetesHpaScaleMaximum
-    expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas'
+    expr: '(kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas) and (kube_horizontalpodautoscaler_spec_max_replicas > 1) and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)'
     for: 2m
     labels:
       severity: info
     annotations:
-      summary: Kubernetes HPA scale capability (instance {{ $labels.instance }})
+      summary: Kubernetes HPA scale maximum (instance {{ $labels.instance }})
-      description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has hit maximum number of desired pods\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesHpaUnderutilized
     expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3'
@@ -155,7 +164,7 @@ groups:
       severity: info
     annotations:
       summary: Kubernetes HPA underutilized (instance {{ $labels.instance }})
-      description: "HPA is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} is constantly at minimum replicas for 50% of the time. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesPodNotHealthy
     expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0'
@@ -163,8 +172,8 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes Pod not healthy (instance {{ $labels.instance }})
+      summary: Kubernetes Pod not healthy ({{ $labels.namespace }}/{{ $labels.pod }})
-      description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-running state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesPodCrashLooping
     expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3'
@@ -172,17 +181,17 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes pod crash looping (instance {{ $labels.instance }})
+      summary: Kubernetes pod crash looping ({{ $labels.namespace }}/{{ $labels.pod }})
-      description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
-  - alert: KubernetesReplicassetMismatch
+  - alert: KubernetesReplicasetReplicasMismatch
     expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas'
     for: 10m
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})
+      summary: Kubernetes ReplicasSet mismatch ({{ $labels.namespace }}/{{ $labels.replicaset }})
-      description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesDeploymentReplicasMismatch
     expr: 'kube_deployment_spec_replicas != kube_deployment_status_replicas_available'
@@ -190,8 +199,8 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})
+      summary: Kubernetes Deployment replicas mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
-      description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesStatefulsetReplicasMismatch
     expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas'
@@ -200,7 +209,7 @@ groups:
       severity: warning
     annotations:
       summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})
-      description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesDeploymentGenerationMismatch
     expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation'
@@ -208,8 +217,8 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})
+      summary: Kubernetes Deployment generation mismatch ({{ $labels.namespace }}/{{ $labels.deployment }})
-      description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesStatefulsetGenerationMismatch
     expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation'
@@ -217,8 +226,8 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})
+      summary: Kubernetes StatefulSet generation mismatch ({{ $labels.namespace }}/{{ $labels.statefulset }})
-      description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesStatefulsetUpdateNotRolledOut
     expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)'
@@ -226,8 +235,8 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})
+      summary: Kubernetes StatefulSet update not rolled out ({{ $labels.namespace }}/{{ $labels.statefulset }})
-      description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesDaemonsetRolloutStuck
     expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0'
@@ -235,8 +244,8 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})
+      summary: Kubernetes DaemonSet rollout stuck ({{ $labels.namespace }}/{{ $labels.daemonset }})
-      description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesDaemonsetMisscheduled
     expr: 'kube_daemonset_status_number_misscheduled > 0'
@@ -244,8 +253,8 @@ groups:
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})
+      summary: Kubernetes DaemonSet misscheduled ({{ $labels.namespace }}/{{ $labels.daemonset }})
-      description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+      description: "Some Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesCronjobTooLong
     expr: 'time() - kube_cronjob_next_schedule_time > 3600'
@@ -253,20 +262,20 @@ groups:
     labels:
       severity: warning
     annotations:
-      summary: Kubernetes CronJob too long (instance {{ $labels.instance }})
+      summary: Kubernetes CronJob too long ({{ $labels.namespace }}/{{ $labels.cronjob }})
       description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesJobSlowCompletion
-    expr: 'kube_job_spec_completions - kube_job_status_succeeded > 0'
+    expr: 'kube_job_spec_completions - kube_job_status_succeeded - kube_job_status_failed > 0'
     for: 12h
     labels:
       severity: critical
     annotations:
-      summary: Kubernetes job slow completion (instance {{ $labels.instance }})
+      summary: Kubernetes job slow completion ({{ $labels.namespace }}/{{ $labels.job_name }})
       description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesApiServerErrors
-    expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3'
+    expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)"}[1m])) by (instance, job) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) by (instance, job) * 100 > 3'
     for: 2m
     labels:
       severity: critical
@@ -302,7 +311,7 @@ groups:
       description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
 
   - alert: KubernetesApiServerLatency
-    expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1'
+    expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"} [10m])) WITHOUT (subresource)) > 1'
     for: 2m
     labels:
       severity: warning
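Editor's note: the kube_* series in the hunks above are exported by kube-state-metrics (and the KubernetesPersistentvolumeError expression explicitly filters on job="kube-state-metrics", so the scrape job name matters). A minimal scrape sketch; the service address shown is an assumption, not part of this commit.

# Hypothetical prometheus.yml fragment (sketch, not part of this commit)
scrape_configs:
  - job_name: 'kube-state-metrics'   # must match the job label expected by the rules above
    static_configs:
      - targets: ['kube-state-metrics.kube-system.svc:8080']   # assumed service address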
@@ -22,6 +22,15 @@ groups:
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

+ - alert: MysqlHighPreparedStatementsUtilization(>80%)
+ expr: 'max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80'
+ for: 2m
+ labels:
+ severity: warning
+ annotations:
+ summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
+ description: "High utilization of prepared statements (>80%) on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: MysqlHighThreadsRunning
expr: 'max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60'
for: 2m
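As a quick sanity check for the new prepared-statements rule, the utilization it alerts on can be previewed by dropping the threshold; this is just the rule's own sub-expression, shown for illustration:

    max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100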
@@ -1,8 +1,11 @@
groups:
- - name: Node
+ - name: NodeExporter

rules:

- alert: HostOutOfMemory
- expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+ expr: '(node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
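The `* on(instance) group_left (nodename) node_uname_info{nodename=~".+"}` suffix that this commit appends to every node-exporter expression joins the alerting series with node_uname_info, so the firing alert carries the human-readable nodename label. A minimal sketch of the pattern, using up as a stand-in left-hand side (up is only for illustration, not part of the commit):

    up * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}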
@@ -11,7 +14,7 @@ groups:
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostMemoryUnderMemoryPressure
- expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+ expr: '(rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -19,8 +22,17 @@ groups:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

+ - alert: HostMemoryIsUnderutilized
+ expr: '(100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ for: 1w
+ labels:
+ severity: info
+ annotations:
+ summary: Host Memory is underutilized (instance {{ $labels.instance }})
+ description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: HostUnusualNetworkThroughputIn
- expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+ expr: '(sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@@ -29,7 +41,7 @@ groups:
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostUnusualNetworkThroughputOut
- expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+ expr: '(sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@@ -38,7 +50,7 @@ groups:
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostUnusualDiskReadRate
- expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+ expr: '(sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@@ -47,7 +59,7 @@ groups:
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostUnusualDiskWriteRate
- expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+ expr: '(sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -55,11 +67,8 @@ groups:
summary: Host unusual disk write rate (instance {{ $labels.instance }})
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- # Please add ignored mountpoints in node_exporter parameters like
- # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
- # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpace
- expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+ expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -67,11 +76,8 @@ groups:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- # Please add ignored mountpoints in node_exporter parameters like
- # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
- # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostDiskWillFillIn24Hours
- expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
+ expr: '((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -80,7 +86,7 @@ groups:
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostOutOfInodes
- expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+ expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -88,8 +94,17 @@ groups:
summary: Host out of inodes (instance {{ $labels.instance }})
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

+ - alert: HostFilesystemDeviceError
+ expr: 'node_filesystem_device_error == 1'
+ for: 2m
+ labels:
+ severity: critical
+ annotations:
+ summary: Host filesystem device error (instance {{ $labels.instance }})
+ description: "{{ $labels.instance }}: Device error with the {{ $labels.mountpoint }} filesystem\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: HostInodesWillFillIn24Hours
- expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
+ expr: '(node_filesystem_files_free{fstype!="msdosfs"} / node_filesystem_files{fstype!="msdosfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{fstype!="msdosfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{fstype!="msdosfs"} == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -98,7 +113,7 @@ groups:
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostUnusualDiskReadLatency
- expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
+ expr: '(rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -107,25 +122,34 @@ groups:
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostUnusualDiskWriteLatency
- expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.25 and rate(node_disk_writes_completed_total[1m]) > 0
+ expr: '(rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
annotations:
summary: Host unusual disk write latency (instance {{ $labels.instance }})
- description: "Disk latency is growing (write operations > 250ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostHighCpuLoad
- expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
+ expr: '(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 0m
+ for: 10m
labels:
severity: warning
annotations:
summary: Host high CPU load (instance {{ $labels.instance }})
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

+ - alert: HostCpuIsUnderutilized
+ expr: '(100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ for: 1w
+ labels:
+ severity: info
+ annotations:
+ summary: Host CPU is underutilized (instance {{ $labels.instance }})
+ description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
- alert: HostCpuStealNoisyNeighbor
- expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
+ expr: '(avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
@@ -133,20 +157,38 @@ groups:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- # 1000 context switches is an arbitrary number.
- # Alert threshold depends on nature of application.
- # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- - alert: HostContextSwitching
- expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 4000
+ - alert: HostCpuHighIowait
+ expr: '(avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
annotations:
- summary: Host context switching (instance {{ $labels.instance }})
- description: "Context switching is growing on node (> 4000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ summary: Host CPU high iowait (instance {{ $labels.instance }})
+ description: "CPU iowait > 10%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

+ - alert: HostUnusualDiskIo
+ expr: '(rate(node_disk_io_time_seconds_total[1m]) > 0.5) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host unusual disk IO (instance {{ $labels.instance }})
+ description: "Time spent in IO is too high on {{ $labels.instance }}. Check storage for issues.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+
+ - alert: HostContextSwitchingHigh
+ expr: '(rate(node_context_switches_total[15m])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"}))
+ /
+ (rate(node_context_switches_total[1d])/count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})) > 2
+ '
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: Host context switching high (instance {{ $labels.instance }})
+ description: "Context switching is growing on the node (twice the daily average during the last 15m)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
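The new HostContextSwitchingHigh rule compares the last 15 minutes of context switches per core against the same ratio over one day, so the threshold adapts to each machine's normal workload instead of a fixed 4000/s. To eyeball the daily baseline a host will be judged against, the rule's denominator can be run on its own (this is just the sub-expression from the rule above, shown for illustration):

    rate(node_context_switches_total[1d]) / count without(mode,cpu) (node_cpu_seconds_total{mode="idle"})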
- alert: HostSwapIsFillingUp
- expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+ expr: '((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -155,7 +197,7 @@ groups:
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostSystemdServiceCrashed
- expr: node_systemd_unit_state{state="failed"} == 1
+ expr: '(node_systemd_unit_state{state="failed"} == 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
@@ -164,7 +206,7 @@ groups:
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostPhysicalComponentTooHot
- expr: node_hwmon_temp_celsius > 75
+ expr: '((node_hwmon_temp_celsius * ignoring(label) group_left(instance, job, node, sensor) node_hwmon_sensor_label{label!="tctl"} > 75)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@@ -173,7 +215,7 @@ groups:
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostNodeOvertemperatureAlarm
- expr: node_hwmon_temp_crit_alarm_celsius == 1
+ expr: '((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
@@ -182,16 +224,16 @@ groups:
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostRaidArrayGotInactive
- expr: node_md_state{state="inactive"} > 0
+ expr: '(node_md_state{state="inactive"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: critical
annotations:
summary: Host RAID array got inactive (instance {{ $labels.instance }})
- description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostRaidDiskFailure
- expr: node_md_disks{state="failed"} > 0
+ expr: '(node_md_disks{state="failed"} > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -199,17 +241,17 @@ groups:
summary: Host RAID disk failure (instance {{ $labels.instance }})
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- #- alert: HostKernelVersionDeviations
- # expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
- # for: 6h
- # labels:
- # severity: warning
- # annotations:
- # summary: Host kernel version deviations (instance {{ $labels.instance }})
- # description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
+ - alert: HostKernelVersionDeviations
+ expr: '(count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
+ for: 6h
+ labels:
+ severity: warning
+ annotations:
+ summary: Host kernel version deviations (instance {{ $labels.instance }})
+ description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostOomKillDetected
- expr: increase(node_vmstat_oom_kill[1m]) > 0
+ expr: '(increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
@@ -218,7 +260,7 @@ groups:
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostEdacCorrectableErrorsDetected
- expr: increase(node_edac_correctable_errors_total[1m]) > 0
+ expr: '(increase(node_edac_correctable_errors_total[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: info
@@ -227,7 +269,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostEdacUncorrectableErrorsDetected
- expr: node_edac_uncorrectable_errors_total > 0
+ expr: '(node_edac_uncorrectable_errors_total > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 0m
labels:
severity: warning
@@ -236,7 +278,7 @@ groups:
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostNetworkReceiveErrors
- expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
+ expr: '(rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -245,7 +287,7 @@ groups:
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostNetworkTransmitErrors
- expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
+ expr: '(rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -254,7 +296,7 @@ groups:
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostNetworkInterfaceSaturated
- expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
+ expr: '((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 1m
labels:
severity: warning
@@ -263,7 +305,7 @@ groups:
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostNetworkBondDegraded
- expr: (node_bonding_active - node_bonding_slaves) != 0
+ expr: '((node_bonding_active - node_bonding_slaves) != 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -272,7 +314,7 @@ groups:
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostConntrackLimit
- expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
+ expr: '(node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 5m
labels:
severity: warning
@@ -281,8 +323,8 @@ groups:
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostClockSkew
- expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
+ expr: '((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
- for: 2m
+ for: 10m
labels:
severity: warning
annotations:
@@ -290,7 +332,7 @@ groups:
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostClockNotSynchronising
- expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
+ expr: '(min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 2m
labels:
severity: warning
@@ -299,7 +341,7 @@ groups:
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HostRequiresReboot
- expr: node_reboot_required > 0
+ expr: '(node_reboot_required > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}'
for: 4h
labels:
severity: info
194
conf/prometheus/alerts/postgres-exporter.yml
Normal file
@@ -0,0 +1,194 @@
groups:

- name: PostgresExporter

rules:

- alert: PostgresqlDown
expr: 'pg_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql down (instance {{ $labels.instance }})
description: "Postgresql instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlRestarted
expr: 'time() - pg_postmaster_start_time_seconds < 60'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql restarted (instance {{ $labels.instance }})
description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlExporterError
expr: 'pg_exporter_last_scrape_error > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql exporter error (instance {{ $labels.instance }})
description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlTableNotAutoVacuumed
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlTableNotAutoAnalyzed
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlTooManyConnections
expr: 'sum by (instance, job, server) (pg_stat_activity_count) > min by (instance, job, server) (pg_settings_max_connections * 0.8)'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many connections (instance {{ $labels.instance }})
description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlNotEnoughConnections
expr: 'sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql not enough connections (instance {{ $labels.instance }})
description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlHighRollbackRate
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlCommitRateLow
expr: 'rate(pg_stat_database_xact_commit[1m]) < 10'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql commit rate low (instance {{ $labels.instance }})
description: "Postgresql seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlLowXidConsumption
expr: 'rate(pg_txid_current[1m]) < 5'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql low XID consumption (instance {{ $labels.instance }})
description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlUnusedReplicationSlot
expr: 'pg_replication_slots_active == 0'
for: 1m
labels:
severity: warning
annotations:
summary: Postgresql unused replication slot (instance {{ $labels.instance }})
description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlTooManyDeadTuples
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlConfigurationChanged
expr: '{__name__=~"pg_settings_.*"} != ON(__name__, instance) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m'
for: 0m
labels:
severity: info
annotations:
summary: Postgresql configuration changed (instance {{ $labels.instance }})
description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlSslCompressionActive
expr: 'sum(pg_stat_ssl_compression) > 0'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql SSL compression active (instance {{ $labels.instance }})
description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlBloatIndexHigh(>80%)
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlBloatTableHigh(>80%)
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PostgresqlInvalidIndex
expr: 'pg_general_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning
annotations:
summary: Postgresql invalid index (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
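The new PostgresqlTooManyConnections rule fires once active connections exceed 80% of max_connections per instance. To preview the current utilization as a percentage, the rule's comparison can be rearranged into a ratio (a sketch for the expression browser, built only from the metrics the rule itself uses):

    sum by (instance, job, server) (pg_stat_activity_count) / min by (instance, job, server) (pg_settings_max_connections) * 100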
@@ -1,236 +0,0 @@
groups:

- name: Prometheus

rules:

- alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus job missing (instance {{ $labels.instance }})
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTargetMissing
expr: up == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target missing (instance {{ $labels.instance }})
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusAllTargetsMissing
expr: count by (job) (up) == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus all targets missing (instance {{ $labels.instance }})
description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus too many restarts (instance {{ $labels.instance }})
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusAlertmanagerJobMissing
expr: absent(up{job="alertmanager"})
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager job missing (instance {{ $labels.instance }})
description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusAlertmanagerConfigurationReloadFailure
expr: alertmanager_config_last_reload_successful != 1
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusAlertmanagerConfigNotSynced
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusAlertmanagerE2eDeadManSwitch
expr: vector(1)
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusNotConnectedToAlertmanager
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus target empty (instance {{ $labels.instance }})
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
summary: Prometheus large scrape (instance {{ $labels.instance }})
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
for: 0m
labels:
severity: warning
annotations:
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"