mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-05 14:22:15 +01:00
a99d26633b
Each Grafana dashboard has unique ID which can be used to fetch the dashboard
from grafana.com: https://grafana.com/grafana/dashboards/11176
The same dashboard can be accessed via URL with slug: https://grafana.com/grafana/dashboards/11176-victoriametrics-cluster/
But using slug implies that any change to dashboard name will break the link.
So it is better to just use ID, so the dashboard URL will never break.
This is follow-up for ff33e60a3d
Signed-off-by: hagen1778 <roman@victoriametrics.com>
138 lines
7.2 KiB
YAML
138 lines
7.2 KiB
YAML
# File contains default list of alerts for vmagent service.
|
|
# The alerts below are just recommendations and may require some updates
|
|
# and threshold calibration according to every specific setup.
|
|
groups:
|
|
# Alerts group for vmagent assumes that Grafana dashboard
|
|
# https://grafana.com/grafana/dashboards/12683/ is installed.
|
|
# Pls update the `dashboard` annotation according to your setup.
|
|
- name: vmagent
|
|
interval: 30s
|
|
concurrency: 2
|
|
rules:
|
|
- alert: PersistentQueueIsDroppingData
|
|
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
|
|
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
|
|
on instance {{ $labels.instance }} for the last 10m."
|
|
|
|
- alert: RejectedRemoteWriteDataBlocksAreDropped
|
|
expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
|
|
remote-write server data blocks. Check the logs to find the reason for rejects."
|
|
|
|
- alert: TooManyScrapeErrors
|
|
expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
|
|
|
|
- alert: TooManyWriteErrors
|
|
expr: |
|
|
(sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
|
|
+
|
|
sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
|
|
|
|
- alert: TooManyRemoteWriteErrors
|
|
expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
|
|
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
|
|
Ensure that destination is up and reachable."
|
|
|
|
- alert: RemoteWriteConnectionIsSaturated
|
|
expr: |
|
|
(
|
|
rate(vmagent_remotewrite_send_duration_seconds_total[5m])
|
|
/
|
|
vmagent_remotewrite_queues
|
|
) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
|
|
summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
|
|
description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
|
|
is saturated by more than 90% and vmagent won't be able to keep up.\n
|
|
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase
|
|
the number of connections per each remote storage."
|
|
|
|
- alert: PersistentQueueForWritesIsSaturated
|
|
expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
|
|
summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
|
|
description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
|
|
are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
|
|
In this case, consider to decrease load on the vmagent or improve the disk throughput."
|
|
|
|
- alert: PersistentQueueForReadsIsSaturated
|
|
expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
|
|
summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
|
|
description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
|
|
are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
|
|
In this case, consider to decrease load on the vmagent or improve the disk throughput."
|
|
|
|
- alert: SeriesLimitHourReached
|
|
expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
|
description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
|
|
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
|
|
|
- alert: SeriesLimitDayReached
|
|
expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
|
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
|
|
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
|
|
|
- alert: ConfigurationReloadFailure
|
|
expr: |
|
|
vm_promscrape_config_last_reload_successful != 1
|
|
or
|
|
vmagent_relabel_config_last_reload_successful != 1
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
|
|
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
|
|
Check vmagent's logs for detailed error message."
|