VictoriaMetrics/deployment/docker/alerts-vmagent.yml
Aliaksandr Valialkin 859977d591
Revert "lib/promscrape: add metric vm_promscrape_scrapes_skipped_total (#5074)"
This reverts commit 74301cdbf5.

Reason for revert:

vmagent already provides better approach for detecting slow scrape targets via the following query:

    scrape_duration_seconds / scrape_timeout_seconds > 1

This query depends on automatically generated per-target metrics.
See https://docs.victoriametrics.com/vmagent.html#automatically-generated-metrics for more details.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5074
2023-10-02 20:59:56 +02:00

135 lines
7.3 KiB
YAML

# File contains default list of alerts for vmagent service.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
# Alerts group for vmagent assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/12683-victoriametrics-vmagent/ is installed.
# Pls update the `dashboard` annotation according to your setup.
- name: vmagent
interval: 30s
concurrency: 2
rules:
- alert: PersistentQueueIsDroppingData
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
for: 10m
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
on instance {{ $labels.instance }} for the last 10m."
- alert: RejectedRemoteWriteDataBlocksAreDropped
expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
remote-write server data blocks. Check the logs to find the reason for rejects."
- alert: TooManyScrapeErrors
expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
- alert: TooManyWriteErrors
expr: |
(sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
+
sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
- alert: TooManyRemoteWriteErrors
expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
Ensure that destination is up and reachable."
- alert: RemoteWriteConnectionIsSaturated
expr: |
sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, url)
> 0.9 * max(vmagent_remotewrite_queues) by(job, instance, url)
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
is saturated by more than 90% and vmagent won't be able to keep up.\n
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase
the number of connections per each remote storage."
- alert: PersistentQueueForWritesIsSaturated
expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
In this case, consider to decrease load on the vmagent or improve the disk throughput."
- alert: PersistentQueueForReadsIsSaturated
expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
for: 15m
labels:
severity: warning
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
In this case, consider to decrease load on the vmagent or improve the disk throughput."
- alert: SeriesLimitHourReached
expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: SeriesLimitDayReached
expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
labels:
severity: critical
annotations:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: ConfigurationReloadFailure
expr: |
vm_promscrape_config_last_reload_successful != 1
or
vmagent_relabel_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
Check vmagent's logs for detailed error message."