init
This commit is contained in:
commit
1ac93ba11b
16 changed files with 17834 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
|||
.env
|
||||
alertmanager.yml
|
||||
data
|
26
README.md
Normal file
26
README.md
Normal file
|
@ -0,0 +1,26 @@
|
|||
# Docker monitoring stack
|
||||
|
||||
## Includes
|
||||
* prometheus
|
||||
* grafana
|
||||
* alertmanager
|
||||
* node-exporter
|
||||
* blackbox-exporter
|
||||
* cadvisor
|
||||
|
||||
## Prerequisites
|
||||
* docker
|
||||
* docker compose
|
||||
|
||||
## Steps
|
||||
|
||||
1. Create a `.env` file with the following variables:
|
||||
> EXTERNAL_IP=XXX.XXX.XXX.XXX
|
||||
>
|
||||
> GRAFANA_PASSWORD=SECURE_PASSWORD
|
||||
|
||||
2. Copy alertmanager.tmpl to alertmanager.yml
|
||||
* Set the Telegram `bot_token` and `chat_id`
|
||||
|
||||
3. Deploy the stack
|
||||
> docker compose up -d
|
17
conf/alertmanager/alertmanager.tmpl
Normal file
17
conf/alertmanager/alertmanager.tmpl
Normal file
|
@ -0,0 +1,17 @@
|
|||
global:
|
||||
templates:
|
||||
- /etc/alertmanager/templates/*.tmpl
|
||||
route:
|
||||
group_wait: 10s
|
||||
group_interval: 30s
|
||||
repeat_interval: 30m
|
||||
group_by: [alertname, instance]
|
||||
receiver: telegram
|
||||
receivers:
|
||||
- name: telegram
|
||||
telegram_configs:
|
||||
- bot_token: BOT_TOKEN
|
||||
chat_id: CHAT_ID
|
||||
api_url: https://api.telegram.org
|
||||
parse_mode: 'HTML'
|
||||
message: '{{ template "telegram.custom.message" .}}'
|
11
conf/alertmanager/templates/telegram.tmpl
Normal file
11
conf/alertmanager/templates/telegram.tmpl
Normal file
|
@ -0,0 +1,11 @@
|
|||
{{ define "telegram.custom.message" }}
|
||||
{{ range .Alerts }}
|
||||
{{ if eq .Status "firing"}}🔥<b>{{ .Labels.alertname }}</b>🔥{{ else }}👌<b>{{ .Labels.alertname }}</b>👌{{ end }}
|
||||
<b>Labels:</b>{{ range $key, $value := .Labels }}{{ if ne $key "alertname" }}
|
||||
- {{ $key }}: {{ $value }}{{ end }}{{ end }}
|
||||
<b>Annotations:</b>{{ range $key, $value := .Annotations }}
|
||||
- {{ $key }}: {{ reReplaceAll "(?s)LABELS = (.*)" "" $value }}{{ end }}
|
||||
<b>Start:</b> {{ .StartsAt }}{{ if eq .Status "resolved"}}
|
||||
<b>Ended:</b> {{ .EndsAt }}{{ end }}
|
||||
{{ end }}
|
||||
{{ end }}
|
14116
conf/grafana/provisioning/dashboards/Node Exporter Full.json
Normal file
14116
conf/grafana/provisioning/dashboards/Node Exporter Full.json
Normal file
File diff suppressed because it is too large
Load diff
1383
conf/grafana/provisioning/dashboards/blackbox.json
Normal file
1383
conf/grafana/provisioning/dashboards/blackbox.json
Normal file
File diff suppressed because it is too large
Load diff
11
conf/grafana/provisioning/dashboards/dashboard.yml
Normal file
11
conf/grafana/provisioning/dashboards/dashboard.yml
Normal file
|
@ -0,0 +1,11 @@
|
|||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'General'
|
||||
orgId: 1
|
||||
folder: ''
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /etc/grafana/provisioning/dashboards
|
1103
conf/grafana/provisioning/dashboards/docker.json
Normal file
1103
conf/grafana/provisioning/dashboards/docker.json
Normal file
File diff suppressed because it is too large
Load diff
230
conf/grafana/provisioning/dashboards/smokeping.json.bak
Normal file
230
conf/grafana/provisioning/dashboards/smokeping.json.bak
Normal file
|
@ -0,0 +1,230 @@
|
|||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_PROMETHEUS",
|
||||
"label": "Prometheus",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "prometheus",
|
||||
"pluginName": "Prometheus"
|
||||
}
|
||||
],
|
||||
"__requires": [
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "7.3.7"
|
||||
},
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "heatmap",
|
||||
"name": "Heatmap",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "prometheus",
|
||||
"name": "Prometheus",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": "-- Grafana --",
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"gnetId": null,
|
||||
"graphTooltip": 0,
|
||||
"id": 2,
|
||||
"iteration": 1650541706979,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"cards": {
|
||||
"cardPadding": null,
|
||||
"cardRound": null
|
||||
},
|
||||
"color": {
|
||||
"cardColor": "#b4ff00",
|
||||
"colorScale": "sqrt",
|
||||
"colorScheme": "interpolateOranges",
|
||||
"exponent": 0.5,
|
||||
"mode": "opacity"
|
||||
},
|
||||
"dataFormat": "tsbuckets",
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"gridPos": {
|
||||
"h": 17,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"heatmap": {},
|
||||
"hideZeroBuckets": false,
|
||||
"highlightCards": true,
|
||||
"id": 2,
|
||||
"legend": {
|
||||
"show": false
|
||||
},
|
||||
"links": [],
|
||||
"reverseYBuckets": false,
|
||||
"targets": [
|
||||
{
|
||||
"expr": "sum(rate(smokeping_response_duration_seconds_bucket{instance=~\"$prober\",host=\"$target\"}[5m])) by (le)",
|
||||
"format": "heatmap",
|
||||
"intervalFactor": 1,
|
||||
"legendFormat": "{{le}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Smokeping",
|
||||
"tooltip": {
|
||||
"show": true,
|
||||
"showHistogram": false
|
||||
},
|
||||
"type": "heatmap",
|
||||
"xAxis": {
|
||||
"show": true
|
||||
},
|
||||
"xBucketNumber": null,
|
||||
"xBucketSize": null,
|
||||
"yAxis": {
|
||||
"decimals": 0,
|
||||
"format": "s",
|
||||
"logBase": 1,
|
||||
"max": null,
|
||||
"min": "0",
|
||||
"show": true,
|
||||
"splitFactor": null
|
||||
},
|
||||
"yBucketBound": "auto",
|
||||
"yBucketNumber": null,
|
||||
"yBucketSize": null
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 32,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"definition": "",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": true,
|
||||
"name": "prober",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(smokeping_prober_build_info, instance)",
|
||||
"refId": "Prometheus-prober-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"allValue": null,
|
||||
"current": {},
|
||||
"datasource": "${DS_PROMETHEUS}",
|
||||
"definition": "label_values(smokeping_response_duration_seconds_bucket, host)",
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "target",
|
||||
"options": [],
|
||||
"query": {
|
||||
"query": "label_values(smokeping_response_duration_seconds_bucket, host)",
|
||||
"refId": "Prometheus-target-Variable-Query"
|
||||
},
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 1,
|
||||
"tagValuesQuery": "",
|
||||
"tagsQuery": "",
|
||||
"type": "query",
|
||||
"useTags": false
|
||||
},
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "Prometheus",
|
||||
"value": "Prometheus"
|
||||
},
|
||||
"description": null,
|
||||
"error": null,
|
||||
"hide": 2,
|
||||
"includeAll": false,
|
||||
"label": null,
|
||||
"multi": false,
|
||||
"name": "DS_PROMETHEUS",
|
||||
"options": [],
|
||||
"query": "prometheus",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"type": "datasource"
|
||||
}
|
||||
]
|
||||
},
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {
|
||||
"refresh_intervals": [
|
||||
"5s",
|
||||
"10s",
|
||||
"30s",
|
||||
"1m",
|
||||
"5m",
|
||||
"15m",
|
||||
"30m",
|
||||
"1h",
|
||||
"2h",
|
||||
"1d"
|
||||
],
|
||||
"time_options": [
|
||||
"5m",
|
||||
"15m",
|
||||
"1h",
|
||||
"6h",
|
||||
"12h",
|
||||
"24h",
|
||||
"2d",
|
||||
"7d",
|
||||
"30d"
|
||||
]
|
||||
},
|
||||
"timezone": "browser",
|
||||
"title": "Smokeping",
|
||||
"uid": "i5aRaLaik",
|
||||
"version": 1
|
||||
}
|
50
conf/grafana/provisioning/datasources/datasource.yml
Normal file
50
conf/grafana/provisioning/datasources/datasource.yml
Normal file
|
@ -0,0 +1,50 @@
|
|||
# config file version
|
||||
apiVersion: 1
|
||||
|
||||
# list of datasources that should be deleted from the database
|
||||
deleteDatasources:
|
||||
- name: Prometheus
|
||||
orgId: 1
|
||||
|
||||
# list of datasources to insert/update depending
|
||||
# on what's available in the database
|
||||
datasources:
|
||||
# <string, required> name of the datasource. Required
|
||||
- name: Prometheus
|
||||
# <string, required> datasource type. Required
|
||||
type: prometheus
|
||||
# <string, required> access mode. direct or proxy. Required
|
||||
access: proxy
|
||||
# <int> org id. will default to orgId 1 if not specified
|
||||
orgId: 1
|
||||
# <string> url
|
||||
url: http://prometheus:9090
|
||||
# <string> database password, if used
|
||||
password:
|
||||
# <string> database user, if used
|
||||
user:
|
||||
# <string> database name, if used
|
||||
database:
|
||||
# <bool> enable/disable basic auth
|
||||
basicAuth: false
|
||||
# <string> basic auth username, if used
|
||||
basicAuthUser:
|
||||
# <string> basic auth password, if used
|
||||
basicAuthPassword:
|
||||
# <bool> enable/disable with credentials headers
|
||||
withCredentials:
|
||||
# <bool> mark as default datasource. Max one per org
|
||||
isDefault: true
|
||||
# <map> fields that will be converted to json and stored in json_data
|
||||
jsonData:
|
||||
graphiteVersion: "1.1"
|
||||
tlsAuth: false
|
||||
tlsAuthWithCACert: false
|
||||
# <string> json object of data that will be encrypted.
|
||||
secureJsonData:
|
||||
tlsCACert: "..."
|
||||
tlsClientCert: "..."
|
||||
tlsClientKey: "..."
|
||||
version: 1
|
||||
# <bool> allow users to edit datasources from the UI.
|
||||
editable: true
|
74
conf/prometheus/alerts/blackbox.yml
Normal file
74
conf/prometheus/alerts/blackbox.yml
Normal file
|
@ -0,0 +1,74 @@
|
|||
groups:
|
||||
- name: blackbox
|
||||
rules:
|
||||
- alert: BlackboxProbeFailed
|
||||
expr: probe_success == 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox probe failed (instance {{ $labels.instance }})
|
||||
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSlowProbe
|
||||
expr: avg_over_time(probe_duration_seconds[1m]) > 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox slow probe (instance {{ $labels.instance }})
|
||||
description: "Blackbox probe took more than 5s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeHttpFailure
|
||||
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
|
||||
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateWillExpireSoon
|
||||
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||
description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxSslCertificateExpired
|
||||
expr: probe_ssl_earliest_cert_expiry - time() <= 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
|
||||
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowHttp
|
||||
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
|
||||
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: BlackboxProbeSlowPing
|
||||
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
|
||||
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
68
conf/prometheus/alerts/cadvisor.yml
Normal file
68
conf/prometheus/alerts/cadvisor.yml
Normal file
|
@ -0,0 +1,68 @@
|
|||
groups:
|
||||
- name: cadvisor
|
||||
rules:
|
||||
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||
- alert: ContainerKilled
|
||||
expr: time() - container_last_seen > 60
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container killed (instance {{ $labels.instance }})
|
||||
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||
- alert: ContainerAbsent
|
||||
expr: absent(container_last_seen)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container absent (instance {{ $labels.instance }})
|
||||
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerCpuUsage
|
||||
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container CPU usage (instance {{ $labels.instance }})
|
||||
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
||||
- alert: ContainerMemoryUsage
|
||||
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container Memory usage (instance {{ $labels.instance }})
|
||||
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Measures filesystem inode usage per instance. Both sides of the ratio use the same name!="" selector so free and total inodes are summed over the same set of series.
|
||||
- alert: ContainerVolumeUsage
|
||||
expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total{name!=""}) BY (instance))) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container Volume usage (instance {{ $labels.instance }})
|
||||
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerVolumeIoUsage
|
||||
expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container Volume IO usage (instance {{ $labels.instance }})
|
||||
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: ContainerHighThrottleRate
|
||||
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Container high throttle rate (instance {{ $labels.instance }})
|
||||
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
308
conf/prometheus/alerts/node.yml
Normal file
308
conf/prometheus/alerts/node.yml
Normal file
|
@ -0,0 +1,308 @@
|
|||
groups:
|
||||
- name: Node
|
||||
rules:
|
||||
- alert: HostOutOfMemory
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputIn
|
||||
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualNetworkThroughputOut
|
||||
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadRate
|
||||
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteRate
|
||||
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Please add ignored mountpoints in node_exporter parameters like
|
||||
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOutOfInodes
|
||||
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostInodesWillFillIn24Hours
|
||||
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskReadLatency
|
||||
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostUnusualDiskWriteLatency
|
||||
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostHighCpuLoad
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The threshold of 2000 context switches per second per CPU is an arbitrary number.
|
||||
# Alert threshold depends on nature of application.
|
||||
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
||||
- alert: HostContextSwitching
|
||||
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 2000
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host context switching (instance {{ $labels.instance }})
|
||||
description: "Context switching is growing on node (> 2000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSwapIsFillingUp
|
||||
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostSystemdServiceCrashed
|
||||
expr: node_systemd_unit_state{state="failed"} == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostPhysicalComponentTooHot
|
||||
expr: node_hwmon_temp_celsius > 75
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNodeOvertemperatureAlarm
|
||||
expr: node_hwmon_temp_crit_alarm_celsius == 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidArrayGotInactive
|
||||
expr: node_md_state{state="inactive"} > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRaidDiskFailure
|
||||
expr: node_md_disks{state="failed"} > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
#- alert: HostKernelVersionDeviations
|
||||
# expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
|
||||
# for: 6h
|
||||
# labels:
|
||||
# severity: warning
|
||||
# annotations:
|
||||
# summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||
# description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostOomKillDetected
|
||||
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# The time window named in the description matches the 1m range used by increase() in expr.
|
||||
- alert: HostEdacCorrectableErrorsDetected
|
||||
expr: increase(node_edac_correctable_errors_total[1m]) > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# expr tests the raw counter, so $value is the cumulative error count rather than an increase over a time window.
|
||||
- alert: HostEdacUncorrectableErrorsDetected
|
||||
expr: node_edac_uncorrectable_errors_total > 0
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# $value is the ratio of receive errors to received packets over 2m (alert threshold 0.01), not an error count, so it is rendered as a percentage.
|
||||
- alert: HostNetworkReceiveErrors
|
||||
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# $value is the ratio of transmit errors to transmitted packets over 2m (alert threshold 0.01), not an error count, so it is rendered as a percentage.
|
||||
- alert: HostNetworkTransmitErrors
|
||||
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
||||
description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ $value | humanizePercentage }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostNetworkBondDegraded
|
||||
expr: (node_bonding_active - node_bonding_slaves) != 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostConntrackLimit
|
||||
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockSkew
|
||||
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock skew (instance {{ $labels.instance }})
|
||||
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostClockNotSynchronising
|
||||
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: HostRequiresReboot
|
||||
expr: node_reboot_required > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
236
conf/prometheus/alerts/prometheus.yml
Normal file
236
conf/prometheus/alerts/prometheus.yml
Normal file
|
@ -0,0 +1,236 @@
|
|||
groups:
|
||||
- name: Prometheus
|
||||
rules:
|
||||
# Warn as soon as no `up` series exists for the "prometheus" job.
- alert: PrometheusJobMissing
  expr: 'absent(up{job="prometheus"})'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus job missing (instance {{ $labels.instance }})"
    description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical as soon as any scrape target reports up == 0.
- alert: PrometheusTargetMissing
  expr: "up == 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus target missing (instance {{ $labels.instance }})"
    description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when every target of a job is down.
# `sum by (job) (up)` adds the 0/1 `up` values per job, so it equals 0 only
# when all targets of that job are down. The previous `count by (job) (up)`
# counted series regardless of their value and could therefore never be 0.
- alert: PrometheusAllTargetsMissing
  expr: "sum by (job) (up) == 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    # Only the `job` label survives the aggregation, so report the job.
    summary: "Prometheus all targets missing (job {{ $labels.job }})"
    description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when the last configuration reload is not reported as successful.
- alert: PrometheusConfigurationReloadFailure
  expr: "prometheus_config_last_reload_successful != 1"
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus configuration reload failure (instance {{ $labels.instance }})"
    description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when the process start time of a prometheus, pushgateway or
# alertmanager job changed more than twice within 15 minutes.
- alert: PrometheusTooManyRestarts
  expr: 'changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus too many restarts (instance {{ $labels.instance }})"
    description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn as soon as no `up` series exists for the "alertmanager" job.
- alert: PrometheusAlertmanagerJobMissing
  expr: 'absent(up{job="alertmanager"})'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus AlertManager job missing (instance {{ $labels.instance }})"
    description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when Alertmanager's last configuration reload is not reported as successful.
- alert: PrometheusAlertmanagerConfigurationReloadFailure
  expr: "alertmanager_config_last_reload_successful != 1"
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})"
    description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when more than one distinct alertmanager_config_hash value is observed.
- alert: PrometheusAlertmanagerConfigNotSynced
  expr: 'count(count_values("config_hash", alertmanager_config_hash)) > 1'
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})"
    description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Always-firing alert: vector(1) returns a sample on every evaluation.
# Intended as an end-to-end check of the Prometheus -> Alertmanager path.
- alert: PrometheusAlertmanagerE2eDeadManSwitch
  expr: "vector(1)"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})"
    description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when Prometheus has discovered fewer than one Alertmanager.
- alert: PrometheusNotConnectedToAlertmanager
  expr: "prometheus_notifications_alertmanagers_discovered < 1"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})"
    description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the rule evaluation failure counter increased in the last 3 minutes.
- alert: PrometheusRuleEvaluationFailures
  expr: "increase(prometheus_rule_evaluation_failures_total[3m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the template text expansion failure counter increased in the last 3 minutes.
- alert: PrometheusTemplateTextExpansionFailures
  expr: "increase(prometheus_template_text_expansion_failures_total[3m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when a rule group's last evaluation has taken longer than its
# interval, sustained for 5 minutes.
- alert: PrometheusRuleEvaluationSlow
  expr: "prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})"
    description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when the notification queue length never dropped to 0 during the last 10 minutes.
- alert: PrometheusNotificationsBacklog
  expr: "min_over_time(prometheus_notifications_queue_length[10m]) > 0"
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus notifications backlog (instance {{ $labels.instance }})"
    description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when Alertmanager's failed-notification counter has a positive 1-minute rate.
- alert: PrometheusAlertmanagerNotificationFailing
  expr: "rate(alertmanager_notifications_failed_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})"
    description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when service discovery reports zero discovered targets.
- alert: PrometheusTargetEmpty
  expr: "prometheus_sd_discovered_targets == 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus target empty (instance {{ $labels.instance }})"
    description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when the 0.9 quantile of the actual scrape interval length stays
# above 60 seconds for 5 minutes.
- alert: PrometheusTargetScrapingSlow
  expr: 'prometheus_target_interval_length_seconds{quantile="0.9"} > 60'
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus target scraping slow (instance {{ $labels.instance }})"
    description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when more than 10 scrapes exceeded the sample limit within 10 minutes,
# sustained for 5 minutes.
- alert: PrometheusLargeScrape
  expr: "increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10"
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus large scrape (instance {{ $labels.instance }})"
    description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Warn when the duplicate-timestamp sample counter increased in the last 5 minutes.
- alert: PrometheusTargetScrapeDuplicate
  expr: "increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0"
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})"
    description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the checkpoint creation failure counter increased in the last minute.
- alert: PrometheusTsdbCheckpointCreationFailures
  expr: "increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the checkpoint deletion failure counter increased in the last minute.
- alert: PrometheusTsdbCheckpointDeletionFailures
  expr: "increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the TSDB compaction failure counter increased in the last minute.
- alert: PrometheusTsdbCompactionsFailed
  expr: "increase(prometheus_tsdb_compactions_failed_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the TSDB head truncation failure counter increased in the last minute.
- alert: PrometheusTsdbHeadTruncationsFailed
  expr: "increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the TSDB reload failure counter increased in the last minute.
- alert: PrometheusTsdbReloadFailures
  expr: "increase(prometheus_tsdb_reloads_failures_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the TSDB WAL corruption counter increased in the last minute.
- alert: PrometheusTsdbWalCorruptions
  expr: "increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
# Critical when the TSDB WAL truncation failure counter increased in the last minute.
- alert: PrometheusTsdbWalTruncationsFailed
  expr: "increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0"
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})"
    description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
78
conf/prometheus/prometheus.yml
Normal file
78
conf/prometheus/prometheus.yml
Normal file
|
@ -0,0 +1,78 @@
|
|||
# Defaults applied to every scrape job that does not override them.
global:
  scrape_interval: "20s"
|
||||
|
||||
# Load every alert rule file found in the alerts/ directory.
rule_files:
  - "alerts/*.yml"
|
||||
|
||||
# Alerts are pushed to a single, statically configured Alertmanager.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]
|
||||
|
||||
scrape_configs:
|
||||
# Prometheus scrapes itself, every 5s instead of the global interval.
- job_name: prometheus
  scrape_interval: "5s"
  static_configs:
    - targets:
        - "localhost:9090"
|
||||
|
||||
# Alertmanager's own metrics, scraped every 5s.
- job_name: alertmanager
  scrape_interval: "5s"
  static_configs:
    - targets:
        - "alertmanager:9093"
|
||||
|
||||
# Host metrics from the node-exporter container.
- job_name: node
  static_configs:
    - targets:
        - "node-exporter:9100"
|
||||
|
||||
# Container metrics from cAdvisor.
- job_name: cadvisor
  static_configs:
    - targets:
        - "cadvisor:8080"
|
||||
|
||||
# Remote target scraped over HTTPS at a non-default metrics path.
- job_name: node_nyyu
  scheme: https
  metrics_path: "/node/metrics"
  static_configs:
    - targets:
        - "nyyu.dev:443"
|
||||
|
||||
# - job_name: "smokeping"
|
||||
# static_configs:
|
||||
# - targets: ["smokeping:9374"]
|
||||
|
||||
# HTTP probe (module http_2xx) through the blackbox exporter.
# Targets come from the A records of nyyu.dev, each combined with port 443.
- job_name: blackbox
  metrics_path: "/probe"
  params:
    module:
      - http_2xx
  dns_sd_configs:
    - names:
        - nyyu.dev
      type: "A"
      port: 443
  relabel_configs:
    # Wrap the discovered address into an https URL and pass it as ?target=.
    - source_labels:
        - __address__
      target_label: __param_target
      replacement: "https://$1/"
    # Keep the probed URL as a visible `target` label.
    - source_labels:
        - __param_target
      target_label: target
    # Send the scrape itself to the blackbox exporter.
    - target_label: __address__
      replacement: "blackbox:9115"
    # Pass the DNS name as ?hostname= and expose it as the `vhost` label.
    - source_labels:
        - __meta_dns_name
      target_label: __param_hostname
    - source_labels:
        - __meta_dns_name
      target_label: vhost
|
||||
|
||||
# ICMP probe (module icmp) through the blackbox exporter.
- job_name: blackbox_icmp
  metrics_path: "/probe"
  params:
    module:
      - icmp
  static_configs:
    - targets: ["1.1.1.1", "nyyu.dev"]
  relabel_configs:
    # Pass the configured target as ?target=.
    - source_labels:
        - __address__
      target_label: __param_target
    # Keep the probed host as a visible `target` label.
    - source_labels:
        - __param_target
      target_label: target
    # Send the scrape itself to the blackbox exporter.
    - target_label: __address__
      replacement: "blackbox:9115"
|
120
docker-compose.yml
Normal file
120
docker-compose.yml
Normal file
|
@ -0,0 +1,120 @@
|
|||
version: "3.8"

# Single bridge network shared by every service in this stack.
networks:
  monitoring:
    driver: bridge
|
||||
|
||||
services:
|
||||
# Grafana UI, published on host port 3000.
grafana:
  # ARM bug in rate interval : https://github.com/grafana/grafana/issues/43002
  image: grafana/grafana:8.2.7
  container_name: grafana
  restart: unless-stopped
  volumes:
    - ./data/grafana:/var/lib/grafana
    - ./conf/grafana/provisioning:/etc/grafana/provisioning
  ports:
    - "3000:3000"
  environment:
    - GF_SECURITY_ADMIN_USER=admin
    # `:?` makes `docker compose up` abort with this message when
    # GRAFANA_PASSWORD is unset or empty, instead of silently passing an
    # empty admin password to a publicly published service.
    - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:?GRAFANA_PASSWORD must be set in .env}"
    - GF_USERS_ALLOW_SIGN_UP=false
  networks:
    - monitoring
|
||||
# Prometheus server; configuration and TSDB data are bind-mounted from the repo.
prometheus:
  image: prom/prometheus
  container_name: prometheus
  restart: unless-stopped
  volumes:
    - "./conf/prometheus:/etc/prometheus"
    - "./data/prometheus:/prometheus"
  ports:
    - "9090:9090"
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    - "--storage.tsdb.path=/prometheus"
    - "--web.console.libraries=/etc/prometheus/console_libraries"
    - "--web.console.templates=/etc/prometheus/consoles"
    - "--web.enable-lifecycle"
    # EXTERNAL_IP is interpolated by Compose (README: defined in .env).
    - "--web.external-url=http://${EXTERNAL_IP}:9090"
  expose:
    - 9090
  networks:
    - monitoring
|
||||
# Alertmanager; reads alertmanager.yml from the bind-mounted conf directory.
alertmanager:
  image: prom/alertmanager
  container_name: alertmanager
  restart: unless-stopped
  volumes:
    - "./conf/alertmanager:/etc/alertmanager"
    - "./data/alertmanager:/alertmanager"
  ports:
    - "9093:9093"
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
    # EXTERNAL_IP is interpolated by Compose (README: defined in .env).
    - "--web.external-url=http://${EXTERNAL_IP}:9093"
  expose:
    - 9093
  networks:
    - monitoring
|
||||
# Host metrics exporter; the host's /proc, /sys and / are mounted read-only.
node-exporter:
  image: prom/node-exporter
  container_name: node-exporter
  restart: unless-stopped
  volumes:
    - "/proc:/host/proc:ro"
    - "/sys:/host/sys:ro"
    - "/:/rootfs:ro"
  command:
    - "--path.procfs=/host/proc"
    - "--path.rootfs=/rootfs"
    - "--path.sysfs=/host/sys"
    # `$$` is Compose's escape for a literal `$` in the regex.
    - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"
  expose:
    - 9100
  networks:
    - monitoring
|
||||
# Blackbox exporter; only exposed inside the monitoring network.
# The container is named `blackbox`.
blackbox_exporter:
  image: prom/blackbox-exporter
  container_name: blackbox
  restart: unless-stopped
  expose:
    - 9115
  networks:
    - monitoring
|
||||
# smokeping:
|
||||
# image: quay.io/superq/smokeping-prober
|
||||
# container_name: smokeping
|
||||
# restart: unless-stopped
|
||||
# command: nyyu.dev
|
||||
# privileged: true
|
||||
# expose:
|
||||
# - 9374
|
||||
# networks:
|
||||
# - monitoring
|
||||
# cAdvisor container metrics; runs privileged in the host PID namespace.
cadvisor:
  # ARM image
  image: justrobin/cadvisor:v0.44.0
  container_name: cadvisor
  restart: unless-stopped
  command:
    - "--housekeeping_interval=10s"
    - "--raw_cgroup_prefix_whitelist=/docker/"
    - "--disable_metrics=cpu_topology,hugetlb"
  privileged: true
  pid: host
  ports:
    # Container port 8080 is published on host port 8040.
    - "8040:8080"
  volumes:
    - "/:/rootfs:ro"
    - "/var/run:/var/run:ro"
    - "/sys:/sys:ro"
    - "/var/lib/docker/:/var/lib/docker:ro"
    - "/dev/disk/:/dev/disk:ro"
  devices:
    - "/dev/kmsg:/dev/kmsg"
  expose:
    - 8080
  networks:
    - monitoring
|
Loading…
Add table
Reference in a new issue