mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-30 15:50:07 +01:00
a99d26633b
Each Grafana dashboard has a unique ID which can be used to fetch the dashboard
from grafana.com: https://grafana.com/grafana/dashboards/11176
The same dashboard can be accessed via URL with slug: https://grafana.com/grafana/dashboards/11176-victoriametrics-cluster/
But using the slug implies that any change to the dashboard name will break the link.
So it is better to use just the ID, so that the dashboard URL never breaks.
This is a follow-up for ff33e60a3d
Signed-off-by: hagen1778 <roman@victoriametrics.com>
86 lines
4.5 KiB
YAML
86 lines
4.5 KiB
YAML
# This file contains the default list of alerts for the vmalert service.
|
|
# The alerts below are just recommendations and may require some updates
|
|
# and threshold calibration according to every specific setup.
|
|
groups:
|
|
# Alerts group for vmalert assumes that Grafana dashboard
|
|
# https://grafana.com/grafana/dashboards/14950/ is installed.
|
|
# Please update the `dashboard` annotation according to your setup.
|
|
- name: vmalert
|
|
interval: 30s
|
|
rules:
|
|
# Fires while vmalert_config_last_reload_successful is anything other than 1.
# There is no `for` clause, so no extra hold duration is configured on this rule.
- alert: ConfigurationReloadFailure
  expr: vmalert_config_last_reload_successful != 1
  labels:
    severity: warning
  annotations:
    summary: 'Configuration reload failed for vmalert instance {{ $labels.instance }}'
    # Folded scalar: the line break below is folded into a single space.
    description: >-
      Configuration hot-reload failed for vmalert on instance {{ $labels.instance }}.
      Check vmalert's logs for detailed error message.
|
|
|
|
# Fires when the 5m increase of vmalert_alerting_rules_errors_total, summed with
# the `alertname` and `id` labels removed, stays above zero for 5 minutes.
- alert: AlertingRulesError
  expr: sum(increase(vmalert_alerting_rules_errors_total[5m])) without(alertname, id) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    # Placeholder Grafana address; adjust host and dashboard UID to the local setup.
    dashboard: 'http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}'
    summary: 'Alerting rules are failing for vmalert instance {{ $labels.instance }}'
    # Folded scalar: the line break below is folded into a single space.
    description: >-
      Alerting rules execution is failing for group "{{ $labels.group }}".
      Check vmalert's logs for detailed error message.
|
|
|
|
# Fires when the 5m increase of vmalert_recording_rules_errors_total, summed with
# the `recording` and `id` labels removed, stays above zero for 5 minutes.
- alert: RecordingRulesError
  expr: sum(increase(vmalert_recording_rules_errors_total[5m])) without(recording, id) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    # Placeholder Grafana address; adjust host and dashboard UID to the local setup.
    dashboard: 'http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}'
    summary: 'Recording rules are failing for vmalert instance {{ $labels.instance }}'
    # Folded scalar: the line break below is folded into a single space.
    description: >-
      Recording rules execution is failing for group "{{ $labels.group }}".
      Check vmalert's logs for detailed error message.
|
|
|
|
# Fires when a recording rule has reported fewer than 1 sample on its last
# evaluation for 30 minutes in a row.
#
# Only `id` is aggregated away: the annotations below render
# {{ $labels.recording }}, so the `recording` label must survive the aggregation.
# With `without(recording, id)` that label was dropped and rendered as empty,
# and the alert was evaluated per group instead of per recording rule.
- alert: RecordingRulesNoData
  expr: sum(vmalert_recording_rules_last_evaluation_samples) without(id) < 1
  for: 30m
  labels:
    severity: info
  annotations:
    # Placeholder Grafana address; adjust host and dashboard UID to the local setup.
    dashboard: 'http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}'
    # The group template previously read `({ $labels.group }})` with a missing
    # opening brace, which is not a valid template action.
    summary: 'Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data'
    # Folded scalar: line breaks below are folded into single spaces.
    description: >-
      Recording rule "{{ $labels.recording }}" from group "{{ $labels.group }}"
      produces 0 samples over the last 30min. It might be caused by a misconfiguration
      or incorrect query expression.
|
|
|
|
# Fires when vmalert_iteration_missed_total keeps increasing over 5m windows
# for 15 minutes in a row.
- alert: TooManyMissedIterations
  expr: increase(vmalert_iteration_missed_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'vmalert instance {{ $labels.instance }} is missing rules evaluations'
    # Folded scalar: line breaks below are folded into single spaces, so the
    # rendered text is one continuous paragraph.
    description: >-
      vmalert instance {{ $labels.instance }} is missing rules evaluations
      for group "{{ $labels.group }}".
      The group evaluation time takes longer than the configured evaluation interval.
      This may result in missed alerting notifications or recording rules samples.
      Try increasing evaluation interval or concurrency of group "{{ $labels.group }}".
      See https://docs.victoriametrics.com/vmalert.html#groups.
      If rule expressions are taking longer than expected,
      please see https://docs.victoriametrics.com/Troubleshooting.html#slow-queries.
|
|
|
|
# Fires when vmalert_remotewrite_errors_total keeps increasing over 5m windows
# for 15 minutes in a row.
- alert: RemoteWriteErrors
  expr: increase(vmalert_remotewrite_errors_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'vmalert instance {{ $labels.instance }} is failing to push metrics to remote write URL'
    # Folded scalar: line breaks below are folded into single spaces.
    description: >-
      vmalert instance {{ $labels.instance }} is failing to push metrics
      generated via alerting or recording rules to the configured remote write URL.
      Check vmalert's logs for detailed error message.
|
|
|
|
# Fires when vmalert_alerts_send_errors_total keeps increasing over 5m windows
# for 15 minutes in a row.
- alert: AlertmanagerErrors
  expr: increase(vmalert_alerts_send_errors_total[5m]) > 0
  for: 15m
  labels:
    severity: warning
  annotations:
    summary: 'vmalert instance {{ $labels.instance }} is failing to send notifications to Alertmanager'
    # Folded scalar: line breaks below are folded into single spaces.
    description: >-
      vmalert instance {{ $labels.instance }} is failing to send alert notifications
      to "{{ $labels.addr }}".
      Check vmalert's logs for detailed error message.
|