mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-23 00:30:57 +01:00
8fb68152e6
This is a follow-up to
75196d7234
It updates some of the alerting rules to remove unnecessary aggregations.
It keeps aggregations for expressions that combine multiple time series
selectors, to make sure their labels will match.
Signed-off-by: hagen1778 <roman@victoriametrics.com>
90 lines
4.8 KiB
YAML
90 lines
4.8 KiB
YAML
# File contains default list of alerts for various VM components.
# The following alerts are recommended for use for any VM installation.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
  - name: vm-health
    # note the `job` filter and update accordingly to your setup
    rules:
      # Fires when process_start_time_seconds changed more than 2 times
      # within 15m for jobs matching the component regex below.
      - alert: TooManyRestarts
        expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
          description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
            It might be crashlooping."

      # Fires when `up` stays at 0 for 2m for jobs matching the component
      # regex below.
      - alert: ServiceDown
        expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

      # Fires when fewer than 100 file descriptors remain
      # (process_max_fds - process_open_fds) for 5m.
      - alert: ProcessNearFDLimits
        expr: (process_max_fds - process_open_fds) < 100
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
          description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
            Consider to increase the limit as fast as possible."

      # Fires when the minimum of process_resident_memory_anon_bytes over the
      # last 10m exceeds 80% of vm_available_memory_bytes, for 5m. Taking the
      # minimum over the window means every sample in it was above the threshold.
      - alert: TooHighMemoryUsage
        expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")"
          description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
            Consider to either increase available memory or decrease the load on the process."

      # Fires when the 5m rate of process_cpu_seconds_total divided by
      # process_cpu_cores_available stays above 0.9 for 5m.
      - alert: TooHighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
          description: "Too high CPU usage may be a sign of insufficient resources and make process unstable.
            Consider to either increase available CPU resources or decrease the load on the process."

      # Fires when error-level log messages keep being emitted for 15m.
      # {{ $value }} is the increase of vm_log_messages_total{level="error"}
      # over the last 5m, summed without the app_version and location labels.
      - alert: TooManyLogs
        expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
          description: "Job \"{{ $labels.job }}\" ({{ $labels.instance }}) logged {{ $value }} error messages over the last 5m,
            and has kept logging errors for at least 15m.\n
            Worth to check logs for specific error messages."

      # Fires when the 5m rate of vm_missing_tsids_for_metric_id_total stays
      # above 0 for 10m.
      - alert: TooManyTSIDMisses
        expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
        for: 10m
        labels:
          severity: critical
        annotations:
          summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
          description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).\n
            Make sure you're running VictoriaMetrics of v1.85.3 or higher.\n
            Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502"

      # Fires when the 1m average of vm_concurrent_insert_current stays at or
      # above vm_concurrent_insert_capacity for 15m.
      - alert: ConcurrentInsertsHitTheLimit
        expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
          description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n
            Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
            In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
            making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then
            it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."