fix(prometheus): tune alert probe duration

This commit is contained in:
nyyu 2022-04-24 10:32:45 +02:00
parent aa7085082a
commit 40d9875388

View File

@@ -11,13 +11,13 @@ groups:
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxSlowProbe - alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 5 expr: avg_over_time(probe_duration_seconds[1m]) > 2
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Blackbox slow probe (instance {{ $labels.instance }}) summary: Blackbox slow probe (instance {{ $labels.instance }})
description: "Blackbox probe took more than 5s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Blackbox probe took more than 2s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeHttpFailure - alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400 expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
@@ -56,19 +56,19 @@ groups:
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowHttp - alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 expr: avg_over_time(probe_http_duration_seconds[1m]) > 2
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "HTTP request took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: BlackboxProbeSlowPing - alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 2
for: 1m for: 1m
labels: labels:
severity: warning severity: warning
annotations: annotations:
summary: Blackbox probe slow ping (instance {{ $labels.instance }}) summary: Blackbox probe slow ping (instance {{ $labels.instance }})
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" description: "Blackbox ping took more than 2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"