VictoriaMetrics/deployment/docker/alerts-vmagent.yml
7840vz 231f6c7fb2
alerts: inverse grouping in vmagent alerts (#5429)
Aggregations with by() have one sideeffect, that any custom labels you add to hosts are dropped too which can be used for alerts routing.

Therefore, some good practice could be to use without() instead, with labels, like without(path) , or without(url) to get same aggregations but with any external labels left intact.

(cherry picked from commit 75196d7234)
2023-12-11 15:38:14 +01:00

138 lines
7.2 KiB
YAML

# File contains default list of alerts for vmagent service.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
# Alerts group for vmagent assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/12683-victoriametrics-vmagent/ is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmagent
interval: 30s
concurrency: 2
rules:
- alert: PersistentQueueIsDroppingData
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
for: 10m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
on instance {{ $labels.instance }} for the last 10m."
- alert: RejectedRemoteWriteDataBlocksAreDropped
expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
remote-write server data blocks. Check the logs to find the reason for rejects."
- alert: TooManyScrapeErrors
expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
- alert: TooManyWriteErrors
expr: |
(sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
+
sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
- alert: TooManyRemoteWriteErrors
expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
Ensure that destination is up and reachable."
- alert: RemoteWriteConnectionIsSaturated
expr: |
(
rate(vmagent_remotewrite_send_duration_seconds_total[5m])
/
vmagent_remotewrite_queues
) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
is saturated by more than 90% and vmagent won't be able to keep up.\n
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase
the number of connections per each remote storage."
- alert: PersistentQueueForWritesIsSaturated
expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
In this case, consider to decrease load on the vmagent or improve the disk throughput."
- alert: PersistentQueueForReadsIsSaturated
expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
In this case, consider to decrease load on the vmagent or improve the disk throughput."
- alert: SeriesLimitHourReached
expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: SeriesLimitDayReached
expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: ConfigurationReloadFailure
expr: |
vm_promscrape_config_last_reload_successful != 1
or
vmagent_relabel_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
Check vmagent's logs for detailed error message."