mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-13 13:11:37 +01:00
231f6c7fb2
Aggregations with by() have one sideeffect, that any custom labels you add to hosts are dropped too which can be used for alerts routing.
Therefore, some good practice could be to use without() instead, with labels, like without(path) , or without(url) to get same aggregations but with any external labels left intact.
(cherry picked from commit 75196d7234
)
138 lines
7.2 KiB
YAML
138 lines
7.2 KiB
YAML
# File contains default list of alerts for vmagent service.
|
|
# The alerts below are just recommendations and may require some updates
|
|
# and threshold calibration according to every specific setup.
|
|
groups:
|
|
# Alerts group for vmagent assumes that Grafana dashboard
|
|
# https://grafana.com/grafana/dashboards/12683-victoriametrics-vmagent/ is installed.
|
|
# Pls update the `dashboard` annotation according to your setup.
|
|
- name: vmagent
|
|
interval: 30s
|
|
concurrency: 2
|
|
rules:
|
|
- alert: PersistentQueueIsDroppingData
|
|
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
|
|
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
|
|
on instance {{ $labels.instance }} for the last 10m."
|
|
|
|
- alert: RejectedRemoteWriteDataBlocksAreDropped
|
|
expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
|
|
remote-write server data blocks. Check the logs to find the reason for rejects."
|
|
|
|
- alert: TooManyScrapeErrors
|
|
expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
|
|
|
|
- alert: TooManyWriteErrors
|
|
expr: |
|
|
(sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
|
|
+
|
|
sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
|
|
|
|
- alert: TooManyRemoteWriteErrors
|
|
expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
|
|
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
|
|
Ensure that destination is up and reachable."
|
|
|
|
- alert: RemoteWriteConnectionIsSaturated
|
|
expr: |
|
|
(
|
|
rate(vmagent_remotewrite_send_duration_seconds_total[5m])
|
|
/
|
|
vmagent_remotewrite_queues
|
|
) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
|
|
summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
|
|
description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
|
|
is saturated by more than 90% and vmagent won't be able to keep up.\n
|
|
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase
|
|
the number of connections per each remote storage."
|
|
|
|
- alert: PersistentQueueForWritesIsSaturated
|
|
expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
|
|
summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
|
|
description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
|
|
are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
|
|
In this case, consider to decrease load on the vmagent or improve the disk throughput."
|
|
|
|
- alert: PersistentQueueForReadsIsSaturated
|
|
expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
|
|
summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
|
|
description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
|
|
are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
|
|
In this case, consider to decrease load on the vmagent or improve the disk throughput."
|
|
|
|
- alert: SeriesLimitHourReached
|
|
expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
|
description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
|
|
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
|
|
|
- alert: SeriesLimitDayReached
|
|
expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
|
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
|
|
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
|
|
|
- alert: ConfigurationReloadFailure
|
|
expr: |
|
|
vm_promscrape_config_last_reload_successful != 1
|
|
or
|
|
vmagent_relabel_config_last_reload_successful != 1
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
|
|
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
|
|
Check vmagent's logs for detailed error message."
|