# This file contains the default list of alerts for the vmagent service.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to each specific setup.
groups:
  # Alerts group for vmagent assumes that Grafana dashboard
  # https://grafana.com/grafana/dashboards/12683/ is installed.
  # Please update the `dashboard` annotation according to your setup.
  - name: vmagent
    interval: 30s
    concurrency: 2
    rules:
      - alert: PersistentQueueIsDroppingData
        expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
        for: 10m
        labels:
          severity: critical
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
          summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
          description: "Vmagent dropped {{ $value | humanize1024 }} of data from the persistent queue
            on instance {{ $labels.instance }} during the last 10m."
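      # Data is usually dropped from the persistent queue when its on-disk size hits the
      # `-remoteWrite.maxDiskUsagePerURL` limit while the remote storage is unreachable or too slow.
      # A hedged sketch of raising that limit (the 10GiB value below is purely illustrative):
      #
      #   -remoteWrite.maxDiskUsagePerURL=10GiB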
      - alert: RejectedRemoteWriteDataBlocksAreDropped
        expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops data blocks rejected by
            the remote-write server. Check the logs to find the reason for the rejects."
      - alert: TooManyScrapeErrors
        expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for the last 15m"
      - alert: TooManyWriteErrors
        expr: |
          (sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
          +
          sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for the last 15m."
      - alert: TooManyRemoteWriteErrors
        expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
          description: "Vmagent fails to push data via the remote-write protocol to destination \"{{ $labels.url }}\".\n
            Ensure that the destination is up and reachable."
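      # A quick reachability check from the vmagent host can help when this alert fires; a hedged
      # example, assuming the destination is a VictoriaMetrics instance exposing the standard
      # /health endpoint (host and port below are placeholders):
      #
      #   curl -v http://<remote-storage-host>:8428/health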
      - alert: RemoteWriteConnectionIsSaturated
        expr: |
          (
            rate(vmagent_remotewrite_send_duration_seconds_total[5m])
            /
            vmagent_remotewrite_queues
          ) > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
          summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
          description: "The remote-write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
            is saturated by more than 90% and vmagent won't be able to keep up.\n
            This usually means that the `-remoteWrite.queues` command-line flag must be increased in order to increase
            the number of connections to each remote storage."
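      # A minimal sketch of raising the connection count when this alert fires; the flag is the one
      # named above, but the value is illustrative and should be tuned per setup:
      #
      #   -remoteWrite.queues=16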
      - alert: PersistentQueueForWritesIsSaturated
        expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
          summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
          description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
            are saturated by more than 90% and vmagent won't be able to keep up with flushing data to disk.
            In this case, consider decreasing the load on vmagent or improving the disk throughput."
      - alert: PersistentQueueForReadsIsSaturated
        expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
          summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
          description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
            are saturated by more than 90% and vmagent won't be able to keep up with reading data from disk.
            In this case, consider decreasing the load on vmagent or improving the disk throughput."
      - alert: SeriesLimitHourReached
        expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
        labels:
          severity: critical
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
          summary: "Instance {{ $labels.instance }} reached 90% of the hourly series limit"
          description: "The number of unique series seen during the last hour is close to the limit set via the -remoteWrite.maxHourlySeries flag.
            Once the limit is exceeded, samples for new time series will be dropped instead of being sent to the remote storage systems."
      - alert: SeriesLimitDayReached
        expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
        labels:
          severity: critical
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
          summary: "Instance {{ $labels.instance }} reached 90% of the daily series limit"
          description: "The number of unique series seen during the last day is close to the limit set via the -remoteWrite.maxDailySeries flag.
            Once the limit is exceeded, samples for new time series will be dropped instead of being sent to the remote storage systems."
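      # Both series limits are controlled by the vmagent command-line flags named above; a hedged
      # example with purely illustrative values (pick limits matching your expected series churn):
      #
      #   -remoteWrite.maxHourlySeries=500000 -remoteWrite.maxDailySeries=2000000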
      - alert: ConfigurationReloadFailure
        expr: |
          vm_promscrape_config_last_reload_successful != 1
          or
          vmagent_relabel_config_last_reload_successful != 1
        labels:
          severity: warning
        annotations:
          summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
          description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
            Check vmagent's logs for the detailed error message."
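      # After fixing the configuration, a hot-reload can be re-triggered by sending SIGHUP to the
      # vmagent process; a sketch (the pgrep pattern is an assumption about your process name):
      #
      #   kill -HUP "$(pgrep -f vmagent)"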
      - alert: StreamAggrFlushTimeout
        expr: |
          increase(vm_streamaggr_flush_timeouts_total[5m]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Streaming aggregation at \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within the configured aggregation interval."
          description: "The stream aggregation process can't keep up with the load and might produce incorrect aggregation results. Check the logs for more details.
            Possible solutions: increase the aggregation interval; aggregate a smaller number of series; reduce the samples' ingestion rate to stream aggregation."
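      # A minimal sketch of a stream aggregation rule with a wider interval, assuming aggregation is
      # configured via a file passed to `-remoteWrite.streamAggr.config` (metric name, interval and
      # output below are illustrative):
      #
      #   - match: 'http_requests_total'
      #     interval: 5m
      #     outputs: [total]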
      - alert: StreamAggrDedupFlushTimeout
        expr: |
          increase(vm_streamaggr_dedup_flush_timeouts_total[5m]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Deduplication at \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within the configured deduplication interval."
          description: "The deduplication process can't keep up with the load and might produce incorrect results. Check the docs https://docs.victoriametrics.com/stream-aggregation/#deduplication and the logs for more details.
            Possible solutions: increase the deduplication interval; deduplicate a smaller number of series; reduce the samples' ingestion rate."
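      # The deduplication interval is typically set via the `-remoteWrite.streamAggr.dedupInterval`
      # command-line flag or the `dedup_interval` option of the aggregation config; the value below
      # only illustrates widening it (flag names may differ between versions, so check the docs):
      #
      #   -remoteWrite.streamAggr.dedupInterval=2m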