mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-20 23:46:23 +01:00
b367a36183
The 3 alerts for VMagent:
- `RejectedRemoteWriteDataBlocksAreDropped`
- `TooManyScrapeErrors`
- `TooManyWriteErrors`
missed the description annotation.
I moved the summary to description and added a generic summary to these
alerts.
### Checklist
The following checks are **mandatory**:
- [x] My change adheres [VictoriaMetrics contributing
guidelines](https://docs.victoriametrics.com/contributing/).
Signed-off-by: Marco Maurer <marco.kilchhofer@gmail.com>
(cherry picked from commit f17fca718d
)
161 lines
8.7 KiB
YAML
161 lines
8.7 KiB
YAML
# File contains default list of alerts for vmagent service.
|
|
# The alerts below are just recommendations and may require some updates
|
|
# and threshold calibration according to every specific setup.
|
|
groups:
|
|
# Alerts group for vmagent assumes that Grafana dashboard
|
|
# https://grafana.com/grafana/dashboards/12683/ is installed.
|
|
# Pls update the `dashboard` annotation according to your setup.
|
|
- name: vmagent
|
|
interval: 30s
|
|
concurrency: 2
|
|
rules:
|
|
- alert: PersistentQueueIsDroppingData
|
|
expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
|
|
for: 10m
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
|
|
description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
|
|
on instance {{ $labels.instance }} for the last 10m."
|
|
|
|
- alert: RejectedRemoteWriteDataBlocksAreDropped
|
|
expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
|
|
summary: "Vmagent is dropping data blocks that are rejected by remote storage"
|
|
description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops the rejected by
|
|
remote-write server data blocks. Check the logs to find the reason for rejects."
|
|
|
|
- alert: TooManyScrapeErrors
|
|
expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
|
|
summary: "Vmagent fails to scrape one or more targets"
|
|
description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"
|
|
|
|
- alert: TooManyWriteErrors
|
|
expr: |
|
|
(sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
|
|
+
|
|
sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
|
|
summary: "Vmagent responds with too many errors on data ingestion protocols"
|
|
description: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."
|
|
|
|
- alert: TooManyRemoteWriteErrors
|
|
expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
|
|
summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
|
|
description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
|
|
Ensure that destination is up and reachable."
|
|
|
|
- alert: RemoteWriteConnectionIsSaturated
|
|
expr: |
|
|
(
|
|
rate(vmagent_remotewrite_send_duration_seconds_total[5m])
|
|
/
|
|
vmagent_remotewrite_queues
|
|
) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
|
|
summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated"
|
|
description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
|
|
is saturated by more than 90% and vmagent won't be able to keep up.\n
|
|
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase
|
|
the number of connections per each remote storage."
|
|
|
|
- alert: PersistentQueueForWritesIsSaturated
|
|
expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
|
|
summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
|
|
description: "Persistent queue writes for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
|
|
are saturated by more than 90% and vmagent won't be able to keep up with flushing data on disk.
|
|
In this case, consider to decrease load on the vmagent or improve the disk throughput."
|
|
|
|
- alert: PersistentQueueForReadsIsSaturated
|
|
expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
|
|
summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
|
|
description: "Persistent queue reads for vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }})
|
|
are saturated by more than 90% and vmagent won't be able to keep up with reading data from the disk.
|
|
In this case, consider to decrease load on the vmagent or improve the disk throughput."
|
|
|
|
- alert: SeriesLimitHourReached
|
|
expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
|
description: "Max series limit set via -remoteWrite.maxHourlySeries flag is close to reaching the max value.
|
|
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
|
|
|
- alert: SeriesLimitDayReached
|
|
expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
|
|
labels:
|
|
severity: critical
|
|
annotations:
|
|
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
|
|
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
|
|
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
|
|
Then samples for new time series will be dropped instead of sending them to remote storage systems."
|
|
|
|
- alert: ConfigurationReloadFailure
|
|
expr: |
|
|
vm_promscrape_config_last_reload_successful != 1
|
|
or
|
|
vmagent_relabel_config_last_reload_successful != 1
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
|
|
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
|
|
Check vmagent's logs for detailed error message."
|
|
|
|
- alert: StreamAggrFlushTimeout
|
|
expr: |
|
|
increase(vm_streamaggr_flush_timeouts_total[5m]) > 0
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Streaming aggregation at \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within the configured aggregation interval."
|
|
description: "Stream aggregation process can't keep up with the load and might produce incorrect aggregation results. Check logs for more details.
|
|
Possible solutions: increase aggregation interval; aggregate smaller number of series; reduce samples' ingestion rate to stream aggregation."
|
|
|
|
- alert: StreamAggrDedupFlushTimeout
|
|
expr: |
|
|
increase(vm_streamaggr_dedup_flush_timeouts_total[5m]) > 0
|
|
labels:
|
|
severity: warning
|
|
annotations:
|
|
summary: "Deduplication \"{{ $labels.job }}\" (instance {{ $labels.instance }}) can't be finished within configured deduplication interval."
|
|
description: "Deduplication process can't keep up with the load and might produce incorrect results. Check docs https://docs.victoriametrics.com/stream-aggregation/#deduplication and logs for more details.
|
|
Possible solutions: increase deduplication interval; deduplicate smaller number of series; reduce samples' ingestion rate."
|