2018-05-08 12:10:29 +02:00
{
prometheusAlerts+:: {
groups+: [
{
2018-08-06 10:41:18 +02:00
name: 'node-exporter',
2018-05-08 12:10:29 +02:00
rules: [
{
  // Warning tier: available space (as a percentage of filesystem size) is
  // below fsSpaceFillingUpWarningThreshold, a linear extrapolation of the
  // last 6h of node_filesystem_avail_bytes drops below zero 24h ahead, and
  // the filesystem is not reported read-only.
  alert: 'NodeFilesystemSpaceFillingUp',
  annotations: {
    summary: 'Filesystem is predicted to run out of space within the next 24 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.',
  },
  // Selectors and the threshold are substituted from $._config.
  expr: std.format(
    |||
      (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceFillingUpWarningThreshold)d
      and
      predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: { severity: 'warning' },
},
{
  // Critical tier: available space (as a percentage of filesystem size) is
  // below fsSpaceFillingUpCriticalThreshold, a linear extrapolation of the
  // last 6h of node_filesystem_avail_bytes drops below zero 4h ahead, and
  // the filesystem is not reported read-only.
  alert: 'NodeFilesystemSpaceFillingUp',
  annotations: {
    summary: 'Filesystem is predicted to run out of space within the next 4 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast.',
  },
  // Selectors and the threshold are substituted from $._config.
  expr: std.format(
    |||
      (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < %(fsSpaceFillingUpCriticalThreshold)d
      and
      predict_linear(node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: {
    // The severity label value is taken from $._config.nodeCriticalSeverity.
    severity: std.format('%(nodeCriticalSeverity)s', $._config),
  },
},
{
  // Warning tier: available space is below a fixed 5% of filesystem size
  // and the filesystem is not reported read-only.
  alert: 'NodeFilesystemAlmostOutOfSpace',
  annotations: {
    summary: 'Filesystem has less than 5% space left.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
  },
  // Only the label selectors come from $._config; the 5 is hard-coded.
  expr: std.format(
    |||
      (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: { severity: 'warning' },
},
{
  // Critical tier: available space is below a fixed 3% of filesystem size
  // and the filesystem is not reported read-only.
  alert: 'NodeFilesystemAlmostOutOfSpace',
  annotations: {
    summary: 'Filesystem has less than 3% space left.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left.',
  },
  // Only the label selectors come from $._config; the 3 is hard-coded.
  expr: std.format(
    |||
      (
      node_filesystem_avail_bytes{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_size_bytes{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: {
    // The severity label value is taken from $._config.nodeCriticalSeverity.
    severity: std.format('%(nodeCriticalSeverity)s', $._config),
  },
},
{
  // Warning tier: free inodes are below a fixed 40% of the total, a linear
  // extrapolation of the last 6h of node_filesystem_files_free drops below
  // zero 24h ahead, and the filesystem is not reported read-only.
  alert: 'NodeFilesystemFilesFillingUp',
  annotations: {
    summary: 'Filesystem is predicted to run out of inodes within the next 24 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up.',
  },
  // Only the label selectors come from $._config; the 40 is hard-coded.
  expr: std.format(
    |||
      (
      node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 40
      and
      predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 24*60*60) < 0
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: { severity: 'warning' },
},
{
  // Critical tier: free inodes are below a fixed 20% of the total, a linear
  // extrapolation of the last 6h of node_filesystem_files_free drops below
  // zero 4h ahead, and the filesystem is not reported read-only.
  alert: 'NodeFilesystemFilesFillingUp',
  annotations: {
    summary: 'Filesystem is predicted to run out of inodes within the next 4 hours.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast.',
  },
  // Only the label selectors come from $._config; the 20 is hard-coded.
  expr: std.format(
    |||
      (
      node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 20
      and
      predict_linear(node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s}[6h], 4*60*60) < 0
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: {
    // The severity label value is taken from $._config.nodeCriticalSeverity.
    severity: std.format('%(nodeCriticalSeverity)s', $._config),
  },
},
{
  // Warning tier: free inodes are below a fixed 5% of the total and the
  // filesystem is not reported read-only.
  alert: 'NodeFilesystemAlmostOutOfFiles',
  annotations: {
    summary: 'Filesystem has less than 5% inodes left.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
  },
  // Only the label selectors come from $._config; the 5 is hard-coded.
  expr: std.format(
    |||
      (
      node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 5
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: { severity: 'warning' },
},
{
  // Critical tier: free inodes are below a fixed 3% of the total and the
  // filesystem is not reported read-only.
  alert: 'NodeFilesystemAlmostOutOfFiles',
  annotations: {
    summary: 'Filesystem has less than 3% inodes left.',
    description: 'Filesystem on {{ $labels.device }} at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left.',
  },
  // Only the label selectors come from $._config; the 3 is hard-coded.
  expr: std.format(
    |||
      (
      node_filesystem_files_free{%(nodeExporterSelector)s,%(fsSelector)s} / node_filesystem_files{%(nodeExporterSelector)s,%(fsSelector)s} * 100 < 3
      and
      node_filesystem_readonly{%(nodeExporterSelector)s,%(fsSelector)s} == 0
      )
    |||,
    $._config
  ),
  'for': '1h',
  labels: {
    // The severity label value is taken from $._config.nodeCriticalSeverity.
    severity: std.format('%(nodeCriticalSeverity)s', $._config),
  },
},
{
  // Matches when node_network_receive_errs_total increased by more than 10
  // over a 2m window.
  alert: 'NodeNetworkReceiveErrs',
  annotations: {
    summary: 'Network interface is reporting many receive errors.',
    description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.',
  },
  // The expression has no placeholders; it is still passed through the
  // formatter with $._config, as the other rules are.
  expr: std.format(
    |||
      increase(node_network_receive_errs_total[2m]) > 10
    |||,
    $._config
  ),
  'for': '1h',
  labels: { severity: 'warning' },
},
{
  // Matches when node_network_transmit_errs_total increased by more than 10
  // over a 2m window.
  alert: 'NodeNetworkTransmitErrs',
  annotations: {
    summary: 'Network interface is reporting many transmit errors.',
    description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.',
  },
  // The expression has no placeholders; it is still passed through the
  // formatter with $._config, as the other rules are.
  expr: std.format(
    |||
      increase(node_network_transmit_errs_total[2m]) > 10
    |||,
    $._config
  ),
  'for': '1h',
  labels: { severity: 'warning' },
},
2020-03-05 07:55:11 +01:00
{
  // Matches when the ratio node_nf_conntrack_entries /
  // node_nf_conntrack_entries_limit exceeds 0.75. No 'for' duration is set
  // on this rule.
  alert: 'NodeHighNumberConntrackEntriesUsed',
  labels: { severity: 'warning' },
  // The expression has no placeholders; it is still passed through the
  // formatter with $._config, as the other rules are.
  expr: std.format(
    |||
      (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
    |||,
    $._config
  ),
  annotations: {
    description: '{{ $value | humanizePercentage }} of conntrack entries are used',
    summary: 'Number of conntrack are getting close to the limit',
  },
},
2019-09-10 16:52:12 +02:00
{
  // Matches when the clock offset is more than 0.05s away from zero in
  // either direction AND is not moving back towards zero:
  //   - offset above +0.05s with a non-negative 5m derivative, or
  //   - offset below -0.05s with a non-positive 5m derivative.
  // The lower bound must be negative (-0.05). Comparing against +0.05 in the
  // second branch would match every healthy clock whose small offset happens
  // to be flat or decreasing.
  alert: 'NodeClockSkewDetected',
  expr: |||
    (
    node_timex_offset_seconds > 0.05
    and
    deriv(node_timex_offset_seconds[5m]) >= 0
    )
    or
    (
    node_timex_offset_seconds < -0.05
    and
    deriv(node_timex_offset_seconds[5m]) <= 0
    )
  ||| % $._config,
  'for': '10m',
  labels: {
    severity: 'warning',
  },
  annotations: {
    summary: 'Clock skew detected.',
    // The stated bound matches the 0.05s threshold used in expr.
    message: 'Clock on {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.',
  },
},
{
  // Matches when the minimum of node_timex_sync_status over the last 5m
  // equals 0.
  alert: 'NodeClockNotSynchronising',
  annotations: {
    summary: 'Clock not synchronising.',
    message: 'Clock on {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host.',
  },
  // The expression has no placeholders; it is still passed through the
  // formatter with $._config, as the other rules are.
  expr: std.format(
    |||
      min_over_time(node_timex_sync_status[5m]) == 0
    |||,
    $._config
  ),
  'for': '10m',
  labels: { severity: 'warning' },
},
2018-05-08 12:10:29 +02:00
],
},
],
},
}