mirror of
https://github.com/prometheus/node_exporter.git
synced 2024-11-23 20:36:21 +01:00
Add thresholds for memory, disk and system alerts
Signed-off-by: Vitaly Zhuravlev <v-zhuravlev@users.noreply.github.com>
This commit is contained in:
parent
77ae769179
commit
6bdc1d9c98
@ -327,7 +327,7 @@
|
|||||||
alert: 'NodeSystemSaturation',
|
alert: 'NodeSystemSaturation',
|
||||||
expr: |||
|
expr: |||
|
||||||
node_load1{%(nodeExporterSelector)s}
|
node_load1{%(nodeExporterSelector)s}
|
||||||
/ count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > 2
|
/ count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
labels: {
|
labels: {
|
||||||
@ -336,15 +336,15 @@
|
|||||||
annotations: {
|
annotations: {
|
||||||
summary: 'System saturated, load per core is very high.',
|
summary: 'System saturated, load per core is very high.',
|
||||||
description: |||
|
description: |||
|
||||||
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
This might indicate resource saturation on this instance and can cause it to become unresponsive.
|
||||||
|||,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeMemoryMajorPagesFaults',
|
alert: 'NodeMemoryMajorPagesFaults',
|
||||||
expr: |||
|
expr: |||
|
||||||
rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsWarningThreshold)s
|
rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
labels: {
|
labels: {
|
||||||
@ -353,7 +353,7 @@
|
|||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Memory major page faults are occurring at very high rate.',
|
summary: 'Memory major page faults are occurring at very high rate.',
|
||||||
description: |||
|
description: |||
|
||||||
Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsWarningThreshold)s major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
Memory major page faults are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
Please check that there is enough memory available at this instance.
|
Please check that there is enough memory available at this instance.
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
@ -361,7 +361,7 @@
|
|||||||
{
|
{
|
||||||
alert: 'NodeMemoryHighUtilization',
|
alert: 'NodeMemoryHighUtilization',
|
||||||
expr: |||
|
expr: |||
|
||||||
100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)s
|
100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
labels: {
|
labels: {
|
||||||
@ -370,14 +370,14 @@
|
|||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Host is running out of memory.',
|
summary: 'Host is running out of memory.',
|
||||||
description: |||
|
description: |||
|
||||||
Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)s% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
|
||||||
|||,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
alert: 'NodeDiskIOSaturation',
|
alert: 'NodeDiskIOSaturation',
|
||||||
expr: |||
|
expr: |||
|
||||||
rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > 10
|
rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '30m',
|
'for': '30m',
|
||||||
labels: {
|
labels: {
|
||||||
@ -386,9 +386,9 @@
|
|||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Disk IO queue is high.',
|
summary: 'Disk IO queue is high.',
|
||||||
description: |||
|
description: |||
|
||||||
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 30 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
This symptom might indicate disk saturation.
|
This symptom might indicate disk saturation.
|
||||||
|||,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@ -43,6 +43,11 @@
|
|||||||
// just a warning for K8s nodes.
|
// just a warning for K8s nodes.
|
||||||
nodeCriticalSeverity: 'critical',
|
nodeCriticalSeverity: 'critical',
|
||||||
|
|
||||||
|
|
||||||
|
// Load average 1m (per core) on which to trigger the
|
||||||
|
// 'NodeSystemSaturation' alert.
|
||||||
|
systemSaturationPerCoreThreshold: 2,
|
||||||
|
|
||||||
// Available disk space (%) thresholds on which to trigger the
|
// Available disk space (%) thresholds on which to trigger the
|
||||||
// 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
|
// 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
|
||||||
// usage grows in a way that it is predicted to run out in 4h or 1d
|
// usage grows in a way that it is predicted to run out in 4h or 1d
|
||||||
@ -66,7 +71,11 @@
|
|||||||
|
|
||||||
// Threshold for the rate of memory major page faults to trigger
|
// Threshold for the rate of memory major page faults to trigger
|
||||||
// 'NodeMemoryMajorPagesFaults' alert.
|
// 'NodeMemoryMajorPagesFaults' alert.
|
||||||
memoryMajorPagesFaultsWarningThreshold: 500,
|
memoryMajorPagesFaultsThreshold: 500,
|
||||||
|
|
||||||
|
// Disk IO queue level above which to trigger
|
||||||
|
// 'NodeDiskIOSaturation' alert.
|
||||||
|
diskIOSaturationThreshold: 10,
|
||||||
|
|
||||||
rateInterval: '5m',
|
rateInterval: '5m',
|
||||||
// Opt-in for multi-cluster support.
|
// Opt-in for multi-cluster support.
|
||||||
|
Loading…
Reference in New Issue
Block a user