This commit is contained in:
parent
8e94b34e63
commit
f7d73a0df5
10 changed files with 795 additions and 499 deletions
77
conf/prometheus/alerts/google-cadvisor.yml
Normal file
77
conf/prometheus/alerts/google-cadvisor.yml
Normal file
|
@@ -0,0 +1,77 @@
|
|||
# Alerting rules for cAdvisor container metrics (rule group "GoogleCadvisor").
#
# Every rule attaches a `severity` label and two annotations:
#   summary     - one-line headline identifying the series that fired
#   description - explanation followed by the raw VALUE and LABELS of the series
#
# Summaries only reference labels that survive the rule's expression. A label
# removed by `sum(...) by (...)` or by `absent()` renders as an empty string,
# so rules that aggregate away `instance` print their grouping labels instead.
groups:
  - name: GoogleCadvisor
    rules:
      # Fires immediately (for: 0m) for any series whose container_last_seen
      # value is more than 60 seconds behind the evaluation time.
      # NOTE(review): likely noisy where containers are routinely replaced
      # (deployments, autoscaling) - confirm this is acceptable.
      - alert: ContainerKilled
        expr: 'time() - container_last_seen > 60'
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Container killed (instance {{ $labels.instance }})'
          description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Fires when no container_last_seen series exists at all for 5 minutes.
      # absent() on a bare metric name yields one sample with no labels, so
      # the summary does not reference $labels.instance (it would be empty).
      - alert: ContainerAbsent
        expr: 'absent(container_last_seen)'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Container absent (no container_last_seen series)'
          description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # 5-minute CPU usage rate as a percentage of quota/period, computed per
      # (pod, container). Fires after 2 minutes above 80%.
      # The aggregation keeps only pod and container, so those are the labels
      # shown in the summary.
      - alert: ContainerHighCpuUtilization
        expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Container High CPU utilization (pod {{ $labels.pod }}, container {{ $labels.container }})'
          description: "Container CPU utilization is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Working-set memory as a percentage of the memory limit, per
      # (instance, name). `> 0` on the limit drops series whose limit is 0,
      # which also avoids dividing by zero. Fires after 2 minutes above 80%.
      - alert: ContainerHighMemoryUsage
        expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Container High Memory usage (instance {{ $labels.instance }})'
          description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Percentage of filesystem inodes in use per instance: 1 - free/total.
      # Both sides use the same name!="" selector so the ratio is taken over
      # one set of series; filtering only the numerator would shrink
      # free/total and overstate usage.
      - alert: ContainerVolumeUsage
        expr: '(1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80'
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Container Volume usage (instance {{ $labels.instance }})'
          description: "Container Volume inode usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Share of CFS periods in which the container was throttled over the
      # last 5 minutes, per (container, pod, namespace). Fires after 5 minutes
      # above 25%. The summary prints the grouping labels because `instance`
      # is aggregated away.
      - alert: ContainerHighThrottleRate
        expr: 'sum(increase(container_cpu_cfs_throttled_periods_total{container!=""}[5m])) by (container, pod, namespace) / sum(increase(container_cpu_cfs_periods_total[5m])) by (container, pod, namespace) > ( 25 / 100 )'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Container high throttle rate (namespace {{ $labels.namespace }}, pod {{ $labels.pod }}, container {{ $labels.container }})'
          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same ratio as ContainerHighCpuUtilization, but fires (severity info)
      # when it stays below 20% for 7 days, as a right-sizing hint.
      - alert: ContainerLowCpuUtilization
        expr: '(sum(rate(container_cpu_usage_seconds_total{container!=""}[5m])) by (pod, container) / sum(container_spec_cpu_quota{container!=""}/container_spec_cpu_period{container!=""}) by (pod, container) * 100) < 20'
        for: 7d
        labels:
          severity: info
        annotations:
          summary: 'Container Low CPU utilization (pod {{ $labels.pod }}, container {{ $labels.container }})'
          description: "Container CPU utilization is under 20% for 1 week. Consider reducing the allocated CPU.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

      # Same ratio as ContainerHighMemoryUsage, but fires (severity info)
      # when it stays below 20% for 7 days, as a right-sizing hint.
      - alert: ContainerLowMemoryUsage
        expr: '(sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) < 20'
        for: 7d
        labels:
          severity: info
        annotations:
          summary: 'Container Low Memory usage (instance {{ $labels.instance }})'
          description: "Container Memory usage is under 20% for 1 week. Consider reducing the allocated memory.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
Loading…
Add table
Add a link
Reference in a new issue