# Predictive disk-full warning. Fires only when all three conditions hold for
# the same (instance, device, mountpoint) for 2 minutes:
#   1. less than 10% of the filesystem size is still available,
#   2. a linear fit over the last hour of available bytes (tmpfs excluded)
#      lands below zero 24 hours ahead,
#   3. the filesystem is not flagged read-only.
# NOTE(review): per node_exporter's metric help, node_filesystem_avail_bytes is
# the space available to non-root users, while node_filesystem_free_bytes also
# counts root-reserved blocks; a variant built on free_bytes would therefore
# react later - confirm before swapping metrics.
- alert: HostDiskWillFillIn24Hours
  expr: >-
    (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10
    and ON (instance, device, mountpoint)
    predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0
    and ON (instance, device, mountpoint)
    node_filesystem_readonly == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host disk will fill in 24 hours (instance {{ $labels.instance }})'
    description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Low-inode warning. Only series whose mountpoint label is exactly "/rootfs"
# are evaluated: fires when free inodes drop below 10% of the total inode
# count and the same (instance, device, mountpoint) is not read-only.
- alert: HostOutOfInodes
  expr: >-
    node_filesystem_files_free{mountpoint ="/rootfs"}
    / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
    and ON (instance, device, mountpoint)
    node_filesystem_readonly{mountpoint="/rootfs"} == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host out of inodes (instance {{ $labels.instance }})'
    description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Predictive inode-exhaustion warning for the "/rootfs" mountpoint. Requires:
#   1. fewer than 10% of inodes free,
#   2. a linear fit over the last hour of free inodes landing below zero
#      24 hours ahead,
#   3. the filesystem not flagged read-only.
# The first `and` carries no ON (...) clause, so it matches on the full label
# set of both sides; only the read-only check is restricted to
# (instance, device, mountpoint).
- alert: HostInodesWillFillIn24Hours
  expr: >-
    node_filesystem_files_free{mountpoint ="/rootfs"}
    / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10
    and
    predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0
    and ON (instance, device, mountpoint)
    node_filesystem_readonly{mountpoint="/rootfs"} == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host inodes will fill in 24 hours (instance {{ $labels.instance }})'
    description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Read-latency warning. Divides the per-second rate of time spent reading by
# the per-second rate of completed reads (both over 1 minute) to get the
# average time per read, and fires when that exceeds 0.1 (100 ms, as the
# description states). The trailing clause limits the alert to disks that
# actually completed reads in the window.
- alert: HostUnusualDiskReadLatency
  expr: >-
    rate(node_disk_read_time_seconds_total[1m])
    / rate(node_disk_reads_completed_total[1m]) > 0.1
    and rate(node_disk_reads_completed_total[1m]) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual disk read latency (instance {{ $labels.instance }})'
    description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Write-latency warning. Divides the per-second rate of time spent writing by
# the per-second rate of completed writes (both over 1 minute) to get the
# average time per write, and fires when that exceeds 0.1 (100 ms, as the
# description states). The trailing clause limits the alert to disks that
# actually completed writes in the window.
- alert: HostUnusualDiskWriteLatency
  expr: >-
    rate(node_disk_write_time_seconds_total[1m])
    / rate(node_disk_writes_completed_total[1m]) > 0.1
    and rate(node_disk_writes_completed_total[1m]) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host unusual disk write latency (instance {{ $labels.instance }})'
    description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary:Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description:"CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1000 context switches is an arbitrary number.
# The right alert threshold depends on the nature of the application.
description:"RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Warns when node_md_disks reports at least one disk in the "failed" state
# for 2 minutes.
# The array name in the description comes from the `device` label, which is
# the label node_exporter's mdadm collector attaches to node_md_disks. The
# previous `md_device` reference named a label the metric does not carry, so
# the array name rendered as an empty string in the notification.
- alert: HostRaidDiskFailure
  expr: 'node_md_disks{state="failed"} > 0'
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host RAID disk failure (instance {{ $labels.instance }})'
    description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description:"The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Clock-skew rule. The expression selects hosts whose node_timex_offset_seconds
# lies outside +/-0.05 (50 ms, going by the metric's _seconds suffix) and whose
# 5-minute derivative shows the offset is not heading back toward zero:
# above 0.05 with deriv >= 0, or below -0.05 with deriv <= 0.
- alert:HostClockSkew
expr:(node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
description:"Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Clock-sync warning. Fires when node_timex_sync_status was 0 at some point in
# the last minute (its 1-minute minimum equals 0) and, at the same time,
# node_timex_maxerror_seconds is at least 16, sustained for 2 minutes.
- alert: HostClockNotSynchronising
  expr: >-
    min_over_time(node_timex_sync_status[1m]) == 0
    and node_timex_maxerror_seconds >= 16
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: 'Host clock not synchronising (instance {{ $labels.instance }})'
    description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"