init
This commit is contained in:
commit
1ac93ba11b
16 changed files with 17834 additions and 0 deletions
3
.gitignore
vendored
Normal file
3
.gitignore
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
.env
|
||||||
|
alertmanager.yml
|
||||||
|
data
|
26
README.md
Normal file
26
README.md
Normal file
|
@ -0,0 +1,26 @@
|
||||||
|
# Docker monitoring stack
|
||||||
|
|
||||||
|
## Includes
|
||||||
|
* prometheus
|
||||||
|
* grafana
|
||||||
|
* alertmanager
|
||||||
|
* node-exporter
|
||||||
|
* blackbox-exporter
|
||||||
|
* cadvisor
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
* docker
|
||||||
|
* docker compose
|
||||||
|
|
||||||
|
## Steps
|
||||||
|
|
||||||
|
1. Create .env file
|
||||||
|
> EXTERNAL_IP=XXX.XXX.XXX.XXX
|
||||||
|
>
|
||||||
|
> GRAFANA_PASSWORD=SECURE_PASSWORD
|
||||||
|
|
||||||
|
2. Copy alertmanager.tmpl to alertmanager.yml
|
||||||
|
* Set telegram bot_token and chat_id
|
||||||
|
|
||||||
|
3. Deploy the stack
|
||||||
|
> docker compose up -d
|
17
conf/alertmanager/alertmanager.tmpl
Normal file
17
conf/alertmanager/alertmanager.tmpl
Normal file
|
@ -0,0 +1,17 @@
|
||||||
|
global:
|
||||||
|
templates:
|
||||||
|
- /etc/alertmanager/templates/*.tmpl
|
||||||
|
route:
|
||||||
|
group_wait: 10s
|
||||||
|
group_interval: 30s
|
||||||
|
repeat_interval: 30m
|
||||||
|
group_by: [alertname, instance]
|
||||||
|
receiver: telegram
|
||||||
|
receivers:
|
||||||
|
- name: telegram
|
||||||
|
telegram_configs:
|
||||||
|
- bot_token: BOT_TOKEN
|
||||||
|
chat_id: CHAT_ID
|
||||||
|
api_url: https://api.telegram.org
|
||||||
|
parse_mode: 'HTML'
|
||||||
|
message: '{{ template "telegram.custom.message" .}}'
|
11
conf/alertmanager/templates/telegram.tmpl
Normal file
11
conf/alertmanager/templates/telegram.tmpl
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
{{ define "telegram.custom.message" }}
|
||||||
|
{{ range .Alerts }}
|
||||||
|
{{ if eq .Status "firing"}}🔥<b>{{ .Labels.alertname }}</b>🔥{{ else }}👌<b>{{ .Labels.alertname }}</b>👌{{ end }}
|
||||||
|
<b>Labels:</b>{{ range $key, $value := .Labels }}{{ if ne $key "alertname" }}
|
||||||
|
- {{ $key }}: {{ $value }}{{ end }}{{ end }}
|
||||||
|
<b>Annotations:</b>{{ range $key, $value := .Annotations }}
|
||||||
|
- {{ $key }}: {{ reReplaceAll "(?s)LABELS = (.*)" "" $value }}{{ end }}
|
||||||
|
<b>Start:</b> {{ .StartsAt }}{{ if eq .Status "resolved"}}
|
||||||
|
<b>Ended:</b> {{ .EndsAt }}{{ end }}
|
||||||
|
{{ end }}
|
||||||
|
{{ end }}
|
14116
conf/grafana/provisioning/dashboards/Node Exporter Full.json
Normal file
14116
conf/grafana/provisioning/dashboards/Node Exporter Full.json
Normal file
File diff suppressed because it is too large
Load diff
1383
conf/grafana/provisioning/dashboards/blackbox.json
Normal file
1383
conf/grafana/provisioning/dashboards/blackbox.json
Normal file
File diff suppressed because it is too large
Load diff
11
conf/grafana/provisioning/dashboards/dashboard.yml
Normal file
11
conf/grafana/provisioning/dashboards/dashboard.yml
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
providers:
|
||||||
|
- name: 'General'
|
||||||
|
orgId: 1
|
||||||
|
folder: ''
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
editable: true
|
||||||
|
options:
|
||||||
|
path: /etc/grafana/provisioning/dashboards
|
1103
conf/grafana/provisioning/dashboards/docker.json
Normal file
1103
conf/grafana/provisioning/dashboards/docker.json
Normal file
File diff suppressed because it is too large
Load diff
230
conf/grafana/provisioning/dashboards/smokeping.json.bak
Normal file
230
conf/grafana/provisioning/dashboards/smokeping.json.bak
Normal file
|
@ -0,0 +1,230 @@
|
||||||
|
{
|
||||||
|
"__inputs": [
|
||||||
|
{
|
||||||
|
"name": "DS_PROMETHEUS",
|
||||||
|
"label": "Prometheus",
|
||||||
|
"description": "",
|
||||||
|
"type": "datasource",
|
||||||
|
"pluginId": "prometheus",
|
||||||
|
"pluginName": "Prometheus"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"__requires": [
|
||||||
|
{
|
||||||
|
"type": "grafana",
|
||||||
|
"id": "grafana",
|
||||||
|
"name": "Grafana",
|
||||||
|
"version": "7.3.7"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "panel",
|
||||||
|
"id": "heatmap",
|
||||||
|
"name": "Heatmap",
|
||||||
|
"version": ""
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "datasource",
|
||||||
|
"id": "prometheus",
|
||||||
|
"name": "Prometheus",
|
||||||
|
"version": "1.0.0"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"annotations": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"builtIn": 1,
|
||||||
|
"datasource": "-- Grafana --",
|
||||||
|
"enable": true,
|
||||||
|
"hide": true,
|
||||||
|
"iconColor": "rgba(0, 211, 255, 1)",
|
||||||
|
"name": "Annotations & Alerts",
|
||||||
|
"type": "dashboard"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"editable": true,
|
||||||
|
"fiscalYearStartMonth": 0,
|
||||||
|
"gnetId": null,
|
||||||
|
"graphTooltip": 0,
|
||||||
|
"id": 2,
|
||||||
|
"iteration": 1650541706979,
|
||||||
|
"links": [],
|
||||||
|
"liveNow": false,
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"cards": {
|
||||||
|
"cardPadding": null,
|
||||||
|
"cardRound": null
|
||||||
|
},
|
||||||
|
"color": {
|
||||||
|
"cardColor": "#b4ff00",
|
||||||
|
"colorScale": "sqrt",
|
||||||
|
"colorScheme": "interpolateOranges",
|
||||||
|
"exponent": 0.5,
|
||||||
|
"mode": "opacity"
|
||||||
|
},
|
||||||
|
"dataFormat": "tsbuckets",
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"gridPos": {
|
||||||
|
"h": 17,
|
||||||
|
"w": 24,
|
||||||
|
"x": 0,
|
||||||
|
"y": 0
|
||||||
|
},
|
||||||
|
"heatmap": {},
|
||||||
|
"hideZeroBuckets": false,
|
||||||
|
"highlightCards": true,
|
||||||
|
"id": 2,
|
||||||
|
"legend": {
|
||||||
|
"show": false
|
||||||
|
},
|
||||||
|
"links": [],
|
||||||
|
"reverseYBuckets": false,
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"expr": "sum(rate(smokeping_response_duration_seconds_bucket{instance=~\"$prober\",host=\"$target\"}[5m])) by (le)",
|
||||||
|
"format": "heatmap",
|
||||||
|
"intervalFactor": 1,
|
||||||
|
"legendFormat": "{{le}}",
|
||||||
|
"refId": "A"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"title": "Smokeping",
|
||||||
|
"tooltip": {
|
||||||
|
"show": true,
|
||||||
|
"showHistogram": false
|
||||||
|
},
|
||||||
|
"type": "heatmap",
|
||||||
|
"xAxis": {
|
||||||
|
"show": true
|
||||||
|
},
|
||||||
|
"xBucketNumber": null,
|
||||||
|
"xBucketSize": null,
|
||||||
|
"yAxis": {
|
||||||
|
"decimals": 0,
|
||||||
|
"format": "s",
|
||||||
|
"logBase": 1,
|
||||||
|
"max": null,
|
||||||
|
"min": "0",
|
||||||
|
"show": true,
|
||||||
|
"splitFactor": null
|
||||||
|
},
|
||||||
|
"yBucketBound": "auto",
|
||||||
|
"yBucketNumber": null,
|
||||||
|
"yBucketSize": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"refresh": "30s",
|
||||||
|
"schemaVersion": 32,
|
||||||
|
"style": "dark",
|
||||||
|
"tags": [],
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"allValue": null,
|
||||||
|
"current": {},
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"definition": "",
|
||||||
|
"description": null,
|
||||||
|
"error": null,
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": false,
|
||||||
|
"label": null,
|
||||||
|
"multi": true,
|
||||||
|
"name": "prober",
|
||||||
|
"options": [],
|
||||||
|
"query": {
|
||||||
|
"query": "label_values(smokeping_prober_build_info, instance)",
|
||||||
|
"refId": "Prometheus-prober-Variable-Query"
|
||||||
|
},
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"sort": 1,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"allValue": null,
|
||||||
|
"current": {},
|
||||||
|
"datasource": "${DS_PROMETHEUS}",
|
||||||
|
"definition": "label_values(smokeping_response_duration_seconds_bucket, host)",
|
||||||
|
"description": null,
|
||||||
|
"error": null,
|
||||||
|
"hide": 0,
|
||||||
|
"includeAll": false,
|
||||||
|
"label": null,
|
||||||
|
"multi": false,
|
||||||
|
"name": "target",
|
||||||
|
"options": [],
|
||||||
|
"query": {
|
||||||
|
"query": "label_values(smokeping_response_duration_seconds_bucket, host)",
|
||||||
|
"refId": "Prometheus-target-Variable-Query"
|
||||||
|
},
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"sort": 1,
|
||||||
|
"tagValuesQuery": "",
|
||||||
|
"tagsQuery": "",
|
||||||
|
"type": "query",
|
||||||
|
"useTags": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"current": {
|
||||||
|
"selected": false,
|
||||||
|
"text": "Prometheus",
|
||||||
|
"value": "Prometheus"
|
||||||
|
},
|
||||||
|
"description": null,
|
||||||
|
"error": null,
|
||||||
|
"hide": 2,
|
||||||
|
"includeAll": false,
|
||||||
|
"label": null,
|
||||||
|
"multi": false,
|
||||||
|
"name": "DS_PROMETHEUS",
|
||||||
|
"options": [],
|
||||||
|
"query": "prometheus",
|
||||||
|
"refresh": 1,
|
||||||
|
"regex": "",
|
||||||
|
"skipUrlSync": false,
|
||||||
|
"type": "datasource"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"time": {
|
||||||
|
"from": "now-1h",
|
||||||
|
"to": "now"
|
||||||
|
},
|
||||||
|
"timepicker": {
|
||||||
|
"refresh_intervals": [
|
||||||
|
"5s",
|
||||||
|
"10s",
|
||||||
|
"30s",
|
||||||
|
"1m",
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"30m",
|
||||||
|
"1h",
|
||||||
|
"2h",
|
||||||
|
"1d"
|
||||||
|
],
|
||||||
|
"time_options": [
|
||||||
|
"5m",
|
||||||
|
"15m",
|
||||||
|
"1h",
|
||||||
|
"6h",
|
||||||
|
"12h",
|
||||||
|
"24h",
|
||||||
|
"2d",
|
||||||
|
"7d",
|
||||||
|
"30d"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"timezone": "browser",
|
||||||
|
"title": "Smokeping",
|
||||||
|
"uid": "i5aRaLaik",
|
||||||
|
"version": 1
|
||||||
|
}
|
50
conf/grafana/provisioning/datasources/datasource.yml
Normal file
50
conf/grafana/provisioning/datasources/datasource.yml
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
# config file version
|
||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
# list of datasources that should be deleted from the database
|
||||||
|
deleteDatasources:
|
||||||
|
- name: Prometheus
|
||||||
|
orgId: 1
|
||||||
|
|
||||||
|
# list of datasources to insert/update depending
|
||||||
|
# on what's available in the database
|
||||||
|
datasources:
|
||||||
|
# <string, required> name of the datasource. Required
|
||||||
|
- name: Prometheus
|
||||||
|
# <string, required> datasource type. Required
|
||||||
|
type: prometheus
|
||||||
|
# <string, required> access mode. direct or proxy. Required
|
||||||
|
access: proxy
|
||||||
|
# <int> org id. will default to orgId 1 if not specified
|
||||||
|
orgId: 1
|
||||||
|
# <string> url
|
||||||
|
url: http://prometheus:9090
|
||||||
|
# <string> database password, if used
|
||||||
|
password:
|
||||||
|
# <string> database user, if used
|
||||||
|
user:
|
||||||
|
# <string> database name, if used
|
||||||
|
database:
|
||||||
|
# <bool> enable/disable basic auth
|
||||||
|
basicAuth: false
|
||||||
|
# <string> basic auth username, if used
|
||||||
|
basicAuthUser:
|
||||||
|
# <string> basic auth password, if used
|
||||||
|
basicAuthPassword:
|
||||||
|
# <bool> enable/disable with credentials headers
|
||||||
|
withCredentials:
|
||||||
|
# <bool> mark as default datasource. Max one per org
|
||||||
|
isDefault: true
|
||||||
|
# <map> fields that will be converted to json and stored in json_data
|
||||||
|
jsonData:
|
||||||
|
graphiteVersion: "1.1"
|
||||||
|
tlsAuth: false
|
||||||
|
tlsAuthWithCACert: false
|
||||||
|
# <string> json object of data that will be encrypted.
|
||||||
|
secureJsonData:
|
||||||
|
tlsCACert: "..."
|
||||||
|
tlsClientCert: "..."
|
||||||
|
tlsClientKey: "..."
|
||||||
|
version: 1
|
||||||
|
# <bool> allow users to edit datasources from the UI.
|
||||||
|
editable: true
|
74
conf/prometheus/alerts/blackbox.yml
Normal file
74
conf/prometheus/alerts/blackbox.yml
Normal file
|
@ -0,0 +1,74 @@
|
||||||
|
groups:
|
||||||
|
- name: blackbox
|
||||||
|
rules:
|
||||||
|
- alert: BlackboxProbeFailed
|
||||||
|
expr: probe_success == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox probe failed (instance {{ $labels.instance }})
|
||||||
|
description: "Probe failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxSlowProbe
|
||||||
|
expr: avg_over_time(probe_duration_seconds[1m]) > 5
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox slow probe (instance {{ $labels.instance }})
|
||||||
|
description: "Blackbox probe took more than 5s to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxProbeHttpFailure
|
||||||
|
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
|
||||||
|
description: "HTTP status code is not 200-399\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxSslCertificateWillExpireSoon
|
||||||
|
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||||
|
description: "SSL certificate expires in 30 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxSslCertificateWillExpireSoon
|
||||||
|
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
|
||||||
|
description: "SSL certificate expires in 3 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxSslCertificateExpired
|
||||||
|
expr: probe_ssl_earliest_cert_expiry - time() <= 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
|
||||||
|
description: "SSL certificate has expired already\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxProbeSlowHttp
|
||||||
|
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
|
||||||
|
description: "HTTP request took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: BlackboxProbeSlowPing
|
||||||
|
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Blackbox probe slow ping (instance {{ $labels.instance }})
|
||||||
|
description: "Blackbox ping took more than 1s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
68
conf/prometheus/alerts/cadvisor.yml
Normal file
68
conf/prometheus/alerts/cadvisor.yml
Normal file
|
@ -0,0 +1,68 @@
|
||||||
|
groups:
|
||||||
|
- name: cadvisor
|
||||||
|
rules:
|
||||||
|
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||||
|
- alert: ContainerKilled
|
||||||
|
expr: time() - container_last_seen > 60
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container killed (instance {{ $labels.instance }})
|
||||||
|
description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# This rule can be very noisy in dynamic infra with legitimate container start/stop/deployment.
|
||||||
|
- alert: ContainerAbsent
|
||||||
|
expr: absent(container_last_seen)
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container absent (instance {{ $labels.instance }})
|
||||||
|
description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ContainerCpuUsage
|
||||||
|
expr: (sum(rate(container_cpu_usage_seconds_total{name!=""}[3m])) BY (instance, name) * 100) > 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container CPU usage (instance {{ $labels.instance }})
|
||||||
|
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# See https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d
|
||||||
|
- alert: ContainerMemoryUsage
|
||||||
|
expr: (sum(container_memory_working_set_bytes{name!=""}) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container Memory usage (instance {{ $labels.instance }})
|
||||||
|
description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ContainerVolumeUsage
|
||||||
|
expr: (1 - (sum(container_fs_inodes_free{name!=""}) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container Volume usage (instance {{ $labels.instance }})
|
||||||
|
description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ContainerVolumeIoUsage
|
||||||
|
expr: (sum(container_fs_io_current{name!=""}) BY (instance, name) * 100) > 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container Volume IO usage (instance {{ $labels.instance }})
|
||||||
|
description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: ContainerHighThrottleRate
|
||||||
|
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Container high throttle rate (instance {{ $labels.instance }})
|
||||||
|
description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
308
conf/prometheus/alerts/node.yml
Normal file
308
conf/prometheus/alerts/node.yml
Normal file
|
@ -0,0 +1,308 @@
|
||||||
|
groups:
|
||||||
|
- name: Node
|
||||||
|
rules:
|
||||||
|
- alert: HostOutOfMemory
|
||||||
|
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of memory (instance {{ $labels.instance }})
|
||||||
|
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostMemoryUnderMemoryPressure
|
||||||
|
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||||
|
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualNetworkThroughputIn
|
||||||
|
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
||||||
|
description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualNetworkThroughputOut
|
||||||
|
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
||||||
|
description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskReadRate
|
||||||
|
expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskWriteRate
|
||||||
|
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Please add ignored mountpoints in node_exporter parameters like
|
||||||
|
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||||
|
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||||
|
- alert: HostOutOfDiskSpace
|
||||||
|
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Please add ignored mountpoints in node_exporter parameters like
|
||||||
|
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
|
||||||
|
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
|
||||||
|
- alert: HostDiskWillFillIn24Hours
|
||||||
|
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||||
|
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostOutOfInodes
|
||||||
|
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host out of inodes (instance {{ $labels.instance }})
|
||||||
|
description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostInodesWillFillIn24Hours
|
||||||
|
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
|
||||||
|
description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskReadLatency
|
||||||
|
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
||||||
|
description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostUnusualDiskWriteLatency
|
||||||
|
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
||||||
|
description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostHighCpuLoad
|
||||||
|
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||||
|
description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostCpuStealNoisyNeighbor
|
||||||
|
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||||
|
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# The threshold of 2000 context switches per second (per CPU) is an arbitrary number.
|
||||||
|
# Alert threshold depends on nature of application.
|
||||||
|
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
|
||||||
|
- alert: HostContextSwitching
|
||||||
|
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 2000
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host context switching (instance {{ $labels.instance }})
|
||||||
|
description: "Context switching is growing on node (> 2000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostSwapIsFillingUp
|
||||||
|
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host swap is filling up (instance {{ $labels.instance }})
|
||||||
|
description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostSystemdServiceCrashed
|
||||||
|
expr: node_systemd_unit_state{state="failed"} == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host systemd service crashed (instance {{ $labels.instance }})
|
||||||
|
description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostPhysicalComponentTooHot
|
||||||
|
expr: node_hwmon_temp_celsius > 75
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host physical component too hot (instance {{ $labels.instance }})
|
||||||
|
description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNodeOvertemperatureAlarm
|
||||||
|
expr: node_hwmon_temp_crit_alarm_celsius == 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Host node overtemperature alarm (instance {{ $labels.instance }})
|
||||||
|
description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostRaidArrayGotInactive
|
||||||
|
expr: node_md_state{state="inactive"} > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Host RAID array got inactive (instance {{ $labels.instance }})
|
||||||
|
description: "RAID array {{ $labels.device }} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostRaidDiskFailure
|
||||||
|
expr: node_md_disks{state="failed"} > 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host RAID disk failure (instance {{ $labels.instance }})
|
||||||
|
description: "At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
#- alert: HostKernelVersionDeviations
|
||||||
|
# expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
|
||||||
|
# for: 6h
|
||||||
|
# labels:
|
||||||
|
# severity: warning
|
||||||
|
# annotations:
|
||||||
|
# summary: Host kernel version deviations (instance {{ $labels.instance }})
|
||||||
|
# description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostOomKillDetected
|
||||||
|
expr: increase(node_vmstat_oom_kill[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||||
|
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Informational alert: fires as soon as the EDAC correctable-error counter
# increased within the last minute (the [1m] window of the expression).
# The description text names the same one-minute window as the expression.
- alert: HostEdacCorrectableErrorsDetected
  expr: increase(node_edac_correctable_errors_total[1m]) > 0
  for: 0m
  labels:
    severity: info
  annotations:
    summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires whenever the cumulative EDAC uncorrectable-error counter is non-zero.
# The expression has no range window, so $value is the running total of the
# counter, not a per-interval figure; the description is worded accordingly.
# NOTE(review): because the raw counter is compared, the alert stays active
# until the counter goes back to 0 - confirm this is the intended behavior.
- alert: HostEdacUncorrectableErrorsDetected
  expr: node_edac_uncorrectable_errors_total > 0
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in total (cumulative counter).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when more than 1% of received packets on an interface are errors,
# measured as the ratio of two 2-minute rates and sustained for 2 minutes.
# $value is that errors/packets ratio, so the description reports it as a
# ratio rather than formatting it as an integer count of errors.
- alert: HostNetworkReceiveErrors
  expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf \"%.3f\" $value }} (errors per packet) over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when more than 1% of transmitted packets on an interface are errors,
# measured as the ratio of two 2-minute rates and sustained for 2 minutes.
# $value is that errors/packets ratio, so the description reports it as a
# ratio rather than formatting it as an integer count of errors.
- alert: HostNetworkTransmitErrors
  expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
    description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf \"%.3f\" $value }} (errors per packet) over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkInterfaceSaturated
|
||||||
|
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
|
||||||
|
for: 1m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||||
|
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostNetworkBondDegraded
|
||||||
|
expr: (node_bonding_active - node_bonding_slaves) != 0
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host Network Bond Degraded (instance {{ $labels.instance }})
|
||||||
|
description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostConntrackLimit
|
||||||
|
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host conntrack limit (instance {{ $labels.instance }})
|
||||||
|
description: "The number of conntrack is approaching limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostClockSkew
|
||||||
|
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock skew (instance {{ $labels.instance }})
|
||||||
|
description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostClockNotSynchronising
|
||||||
|
expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
|
||||||
|
for: 2m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Host clock not synchronising (instance {{ $labels.instance }})
|
||||||
|
description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: HostRequiresReboot
|
||||||
|
expr: node_reboot_required > 0
|
||||||
|
for: 4h
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
annotations:
|
||||||
|
summary: Host requires reboot (instance {{ $labels.instance }})
|
||||||
|
description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
236
conf/prometheus/alerts/prometheus.yml
Normal file
236
conf/prometheus/alerts/prometheus.yml
Normal file
|
@ -0,0 +1,236 @@
|
||||||
|
groups:
|
||||||
|
- name: Prometheus
|
||||||
|
rules:
|
||||||
|
# Fires when no `up` series exists at all for the "prometheus" scrape job.
# absent() returns a series that only carries the labels named in its equality
# matchers (here: job), so the summary reports the job rather than an
# `instance` label that would always render empty.
- alert: PrometheusJobMissing
  expr: absent(up{job="prometheus"})
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus job missing (job {{ $labels.job }})
    description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTargetMissing
|
||||||
|
expr: up == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus target missing (instance {{ $labels.instance }})
|
||||||
|
description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when every target of a scrape job is down.
# `count by (job) (up)` counts series regardless of their value, so it is >= 1
# whenever the job has targets and returns nothing otherwise; it can never
# equal 0. Summing the 0/1 `up` values is 0 exactly when all targets are down.
# The aggregation keeps only the `job` label, so the summary reports the job.
- alert: PrometheusAllTargetsMissing
  expr: sum by (job) (up) == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: Prometheus all targets missing (job {{ $labels.job }})
    description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusConfigurationReloadFailure
|
||||||
|
expr: prometheus_config_last_reload_successful != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTooManyRestarts
|
||||||
|
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus too many restarts (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
# Fires when no `up` series exists at all for the "alertmanager" scrape job.
# absent() only propagates the labels from its equality matchers (job), so the
# summary reports the job instead of an always-empty `instance` label.
- alert: PrometheusAlertmanagerJobMissing
  expr: absent(up{job="alertmanager"})
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: Prometheus AlertManager job missing (job {{ $labels.job }})
    description: "A Prometheus AlertManager job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusAlertmanagerConfigurationReloadFailure
|
||||||
|
expr: alertmanager_config_last_reload_successful != 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})
|
||||||
|
description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusAlertmanagerConfigNotSynced
|
||||||
|
expr: count(count_values("config_hash", alertmanager_config_hash)) > 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus AlertManager config not synced (instance {{ $labels.instance }})
|
||||||
|
description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusAlertmanagerE2eDeadManSwitch
|
||||||
|
expr: vector(1)
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus DeadManSwitch is an always-firing alert. It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusNotConnectedToAlertmanager
|
||||||
|
expr: prometheus_notifications_alertmanagers_discovered < 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus not connected to alertmanager (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusRuleEvaluationFailures
|
||||||
|
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTemplateTextExpansionFailures
|
||||||
|
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusRuleEvaluationSlow
|
||||||
|
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusNotificationsBacklog
|
||||||
|
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
|
||||||
|
description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusAlertmanagerNotificationFailing
|
||||||
|
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
||||||
|
description: "Alertmanager is failing sending notifications\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTargetEmpty
|
||||||
|
expr: prometheus_sd_discovered_targets == 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus target empty (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTargetScrapingSlow
|
||||||
|
expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus target scraping slow (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusLargeScrape
|
||||||
|
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus large scrape (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus has many scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTargetScrapeDuplicate
|
||||||
|
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbCheckpointCreationFailures
|
||||||
|
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbCheckpointDeletionFailures
|
||||||
|
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbCompactionsFailed
|
||||||
|
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} TSDB compactions failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbHeadTruncationsFailed
|
||||||
|
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbReloadFailures
|
||||||
|
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbWalCorruptions
|
||||||
|
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||||
|
|
||||||
|
- alert: PrometheusTsdbWalTruncationsFailed
|
||||||
|
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
annotations:
|
||||||
|
summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
|
||||||
|
description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
78
conf/prometheus/prometheus.yml
Normal file
78
conf/prometheus/prometheus.yml
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
global:
|
||||||
|
scrape_interval: 20s
|
||||||
|
|
||||||
|
rule_files:
|
||||||
|
- alerts/*.yml
|
||||||
|
|
||||||
|
alerting:
|
||||||
|
alertmanagers:
|
||||||
|
- static_configs:
|
||||||
|
- targets:
|
||||||
|
- alertmanager:9093
|
||||||
|
|
||||||
|
scrape_configs:
|
||||||
|
- job_name: "prometheus"
|
||||||
|
scrape_interval: 5s
|
||||||
|
static_configs:
|
||||||
|
- targets: ["localhost:9090"]
|
||||||
|
|
||||||
|
- job_name: "alertmanager"
|
||||||
|
scrape_interval: 5s
|
||||||
|
static_configs:
|
||||||
|
- targets: ["alertmanager:9093"]
|
||||||
|
|
||||||
|
- job_name: "node"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["node-exporter:9100"]
|
||||||
|
|
||||||
|
- job_name: "cadvisor"
|
||||||
|
static_configs:
|
||||||
|
- targets: ["cadvisor:8080"]
|
||||||
|
|
||||||
|
- job_name: "node_nyyu"
|
||||||
|
scheme: https
|
||||||
|
metrics_path: /node/metrics
|
||||||
|
static_configs:
|
||||||
|
- targets: ["nyyu.dev:443"]
|
||||||
|
|
||||||
|
# - job_name: "smokeping"
|
||||||
|
# static_configs:
|
||||||
|
# - targets: ["smokeping:9374"]
|
||||||
|
|
||||||
|
- job_name: "blackbox"
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module: [http_2xx]
|
||||||
|
dns_sd_configs:
|
||||||
|
- names:
|
||||||
|
- nyyu.dev
|
||||||
|
type: A
|
||||||
|
port: 443
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
replacement: https://$1/
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: target
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: blackbox:9115
|
||||||
|
- source_labels: [__meta_dns_name]
|
||||||
|
target_label: __param_hostname
|
||||||
|
- source_labels: [__meta_dns_name]
|
||||||
|
target_label: vhost
|
||||||
|
|
||||||
|
- job_name: "blackbox_icmp"
|
||||||
|
metrics_path: /probe
|
||||||
|
params:
|
||||||
|
module: [icmp]
|
||||||
|
static_configs:
|
||||||
|
- targets:
|
||||||
|
- 1.1.1.1
|
||||||
|
- nyyu.dev
|
||||||
|
relabel_configs:
|
||||||
|
- source_labels: [__address__]
|
||||||
|
target_label: __param_target
|
||||||
|
- source_labels: [__param_target]
|
||||||
|
target_label: target
|
||||||
|
- target_label: __address__
|
||||||
|
replacement: blackbox:9115
|
120
docker-compose.yml
Normal file
120
docker-compose.yml
Normal file
|
@ -0,0 +1,120 @@
|
||||||
|
version: '3.8'
|
||||||
|
|
||||||
|
networks:
|
||||||
|
monitoring:
|
||||||
|
driver: bridge
|
||||||
|
|
||||||
|
services:
|
||||||
|
grafana:
  # Grafana dashboard UI, published on host port 3000.
  # Image is pinned because of an ARM bug in rate interval:
  # https://github.com/grafana/grafana/issues/43002
  image: grafana/grafana:8.2.7
  container_name: grafana
  restart: unless-stopped
  volumes:
    - ./data/grafana:/var/lib/grafana
    - ./conf/grafana/provisioning:/etc/grafana/provisioning
  ports:
    - "3000:3000"
  environment:
    - GF_SECURITY_ADMIN_USER=admin
    # Abort `docker compose up` when the password is unset or empty instead of
    # silently passing an empty admin password to Grafana.
    - "GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_PASSWORD:?set GRAFANA_PASSWORD in .env}"
    - GF_USERS_ALLOW_SIGN_UP=false
  networks:
    - monitoring
|
||||||
|
prometheus:
  # Prometheus server; configuration and alert rules come from ./conf/prometheus,
  # TSDB data is persisted in ./data/prometheus.
  # NOTE(review): the image tag is unpinned; consider pinning a version.
  image: prom/prometheus
  container_name: prometheus
  restart: unless-stopped
  volumes:
    - ./conf/prometheus:/etc/prometheus
    - ./data/prometheus:/prometheus
  ports:
    - "9090:9090"
  command:
    - '--config.file=/etc/prometheus/prometheus.yml'
    - '--storage.tsdb.path=/prometheus'
    - '--web.console.libraries=/etc/prometheus/console_libraries'
    - '--web.console.templates=/etc/prometheus/consoles'
    # NOTE(review): this turns on the HTTP reload/quit endpoints while port 9090
    # is published on the host; confirm the host is not reachable by untrusted
    # clients.
    - '--web.enable-lifecycle'
    # Abort `docker compose up` when EXTERNAL_IP is unset or empty instead of
    # advertising "http://:9090" as the external URL.
    - "--web.external-url=http://${EXTERNAL_IP:?set EXTERNAL_IP in .env}:9090"
  expose:
    - 9090
  networks:
    - monitoring
|
||||||
|
alertmanager:
  # Alertmanager; expects ./conf/alertmanager/alertmanager.yml to exist (it is
  # git-ignored and must be created from alertmanager.tmpl before deploying).
  image: prom/alertmanager
  container_name: alertmanager
  restart: unless-stopped
  volumes:
    - ./conf/alertmanager:/etc/alertmanager
    - ./data/alertmanager:/alertmanager
  ports:
    - "9093:9093"
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
    # Abort `docker compose up` when EXTERNAL_IP is unset or empty instead of
    # advertising "http://:9093" as the external URL.
    - "--web.external-url=http://${EXTERNAL_IP:?set EXTERNAL_IP in .env}:9093"
  expose:
    - 9093
  networks:
    - monitoring
|
||||||
|
node-exporter:
|
||||||
|
image: prom/node-exporter
|
||||||
|
container_name: node-exporter
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- /proc:/host/proc:ro
|
||||||
|
- /sys:/host/sys:ro
|
||||||
|
- /:/rootfs:ro
|
||||||
|
command:
|
||||||
|
- '--path.procfs=/host/proc'
|
||||||
|
- '--path.rootfs=/rootfs'
|
||||||
|
- '--path.sysfs=/host/sys'
|
||||||
|
- '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
|
||||||
|
expose:
|
||||||
|
- 9100
|
||||||
|
networks:
|
||||||
|
- monitoring
|
||||||
|
blackbox_exporter:
|
||||||
|
image: prom/blackbox-exporter
|
||||||
|
container_name: blackbox
|
||||||
|
restart: unless-stopped
|
||||||
|
expose:
|
||||||
|
- 9115
|
||||||
|
networks:
|
||||||
|
- monitoring
|
||||||
|
# smokeping:
|
||||||
|
# image: quay.io/superq/smokeping-prober
|
||||||
|
# container_name: smokeping
|
||||||
|
# restart: unless-stopped
|
||||||
|
# command: nyyu.dev
|
||||||
|
# privileged: true
|
||||||
|
# expose:
|
||||||
|
# - 9374
|
||||||
|
# networks:
|
||||||
|
# - monitoring
|
||||||
|
cadvisor:
|
||||||
|
# ARM image
|
||||||
|
image: justrobin/cadvisor:v0.44.0
|
||||||
|
container_name: cadvisor
|
||||||
|
restart: unless-stopped
|
||||||
|
command:
|
||||||
|
- '--housekeeping_interval=10s'
|
||||||
|
- '--raw_cgroup_prefix_whitelist=/docker/'
|
||||||
|
- '--disable_metrics=cpu_topology,hugetlb'
|
||||||
|
privileged: true
|
||||||
|
pid: 'host'
|
||||||
|
ports:
|
||||||
|
- '8040:8080'
|
||||||
|
volumes:
|
||||||
|
- '/:/rootfs:ro'
|
||||||
|
- '/var/run:/var/run:ro'
|
||||||
|
- '/sys:/sys:ro'
|
||||||
|
- '/var/lib/docker/:/var/lib/docker:ro'
|
||||||
|
- '/dev/disk/:/dev/disk:ro'
|
||||||
|
devices:
|
||||||
|
- '/dev/kmsg:/dev/kmsg'
|
||||||
|
expose:
|
||||||
|
- 8080
|
||||||
|
networks:
|
||||||
|
- monitoring
|
Loading…
Add table
Reference in a new issue