# File contains default list of alerts for vmagent service.
# The alerts below are just recommendations and may require some updates
# and threshold calibration according to every specific setup.
groups:
  # Alerts group for vmagent assumes that Grafana dashboard
  # https://grafana.com/grafana/dashboards/12683/ is installed.
  # Please update the `dashboard` annotation according to your setup.
  - name: vmagent
    interval: 30s
    concurrency: 2
    rules:
      # --- Data loss -------------------------------------------------------

      # Any growth of the dropped-bytes counter within a 5m window
      # (summed across `path`), sustained for 10m.
      - alert: PersistentQueueIsDroppingData
        expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0
        for: 10m
        labels:
          severity: critical
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
          summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
          description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue
            on instance {{ $labels.instance }} for the last 10m."

      # Any growth of the dropped-packets counter within a 5m window
      # (summed across `url`), sustained for 15m.
      - alert: RejectedRemoteWriteDataBlocksAreDropped
        expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=79&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops
            the rejected by remote-write server data blocks.
            Check the logs to find the reason for rejects."

      # --- Errors ----------------------------------------------------------

      # The failed-scrapes counter keeps increasing over 5m windows for 15m.
      - alert: TooManyScrapeErrors
        expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"

      # Sum of two error counters' 5m increases. Each side is aggregated
      # `without` its own extra labels so the two operands of `+` end up with
      # matching label sets.
      - alert: TooManyWriteErrors
        expr: |
          (
            sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type)
            +
            sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)
          ) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds
            with errors to write requests for last 15m."

      # Non-zero per-second rate of the remote-write retries counter for 15m.
      - alert: TooManyRemoteWriteErrors
        expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
          summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
          description: "Vmagent fails to push data via remote write protocol
            to destination \"{{ $labels.url }}\"\n
            Ensure that destination is up and reachable."

      # --- Saturation ------------------------------------------------------

      # Rate of the send-duration counter divided by the value of
      # `vmagent_remotewrite_queues`; the alert trips above 0.9 (90%).
      - alert: RemoteWriteConnectionIsSaturated
        expr: |
          (
            rate(vmagent_remotewrite_send_duration_seconds_total[5m])
            /
            vmagent_remotewrite_queues
          ) > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}"
          summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }})
            to {{ $labels.url }} is saturated"
          description: "The remote write connection between vmagent \"{{ $labels.job }}\"
            (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\"
            is saturated by more than 90% and vmagent won't be able to keep up.\n
            This usually means that `-remoteWrite.queues` command-line flag must be
            increased in order to increase the number of connections per each remote storage."

      # Rate of the write-duration counter stays above 0.9 for 15m.
      - alert: PersistentQueueForWritesIsSaturated
        expr: rate(vm_persistentqueue_write_duration_seconds_total[5m]) > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=98&var-instance={{ $labels.instance }}"
          summary: "Persistent queue writes for instance {{ $labels.instance }} are saturated"
          description: "Persistent queue writes for vmagent \"{{ $labels.job }}\"
            (instance {{ $labels.instance }}) are saturated by more than 90%
            and vmagent won't be able to keep up with flushing data on disk.
            In this case, consider to decrease load on the vmagent or improve the disk throughput."

      # Rate of the read-duration counter stays above 0.9 for 15m.
      - alert: PersistentQueueForReadsIsSaturated
        expr: rate(vm_persistentqueue_read_duration_seconds_total[5m]) > 0.9
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=99&var-instance={{ $labels.instance }}"
          summary: "Persistent queue reads for instance {{ $labels.instance }} are saturated"
          description: "Persistent queue reads for vmagent \"{{ $labels.job }}\"
            (instance {{ $labels.instance }}) are saturated by more than 90%
            and vmagent won't be able to keep up with reading data from the disk.
            In this case, consider to decrease load on the vmagent or improve the disk throughput."

      # --- Series limits ---------------------------------------------------
      # Both rules below have no `for` clause, so no hold period is configured.

      # Current/max hourly series ratio above 0.9.
      - alert: SeriesLimitHourReached
        expr: (vmagent_hourly_series_limit_current_series / vmagent_hourly_series_limit_max_series) > 0.9
        labels:
          severity: critical
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=88&var-instance={{ $labels.instance }}"
          summary: "Instance {{ $labels.instance }} reached 90% of the limit"
          description: "Max series limit set via -remoteWrite.maxHourlySeries flag
            is close to reaching the max value.
            Then samples for new time series will be dropped instead of sending
            them to remote storage systems."

      # Current/max daily series ratio above 0.9.
      - alert: SeriesLimitDayReached
        expr: (vmagent_daily_series_limit_current_series / vmagent_daily_series_limit_max_series) > 0.9
        labels:
          severity: critical
        annotations:
          dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
          summary: "Instance {{ $labels.instance }} reached 90% of the limit"
          description: "Max series limit set via -remoteWrite.maxDailySeries flag
            is close to reaching the max value.
            Then samples for new time series will be dropped instead of sending
            them to remote storage systems."

      # --- Configuration ---------------------------------------------------

      # Either reload-status gauge reports a value other than 1.
      # No `dashboard` annotation is defined for this rule.
      - alert: ConfigurationReloadFailure
        expr: |
          vm_promscrape_config_last_reload_successful != 1
          or
          vmagent_relabel_config_last_reload_successful != 1
        labels:
          severity: warning
        annotations:
          summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
          description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
            Check vmagent's logs for detailed error message."

      # --- Stream aggregation ----------------------------------------------

      # The flush-timeouts counter grew within the last 5m.
      - alert: StreamAggrFlushTimeout
        expr: |
          increase(vm_streamaggr_flush_timeouts_total[5m]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Streaming aggregation at \"{{ $labels.job }}\" (instance {{ $labels.instance }})
            can't be finished within the configured aggregation interval."
          description: "Stream aggregation process can't keep up with the load
            and might produce incorrect aggregation results.
            Check logs for more details.
            Possible solutions: increase aggregation interval;
            aggregate smaller number of series;
            reduce samples' ingestion rate to stream aggregation."

      # The dedup-flush-timeouts counter grew within the last 5m.
      - alert: StreamAggrDedupFlushTimeout
        expr: |
          increase(vm_streamaggr_dedup_flush_timeouts_total[5m]) > 0
        labels:
          severity: warning
        annotations:
          summary: "Deduplication \"{{ $labels.job }}\" (instance {{ $labels.instance }})
            can't be finished within configured deduplication interval."
          description: "Deduplication process can't keep up with the load
            and might produce incorrect results.
            Check docs https://docs.victoriametrics.com/stream-aggregation/#deduplication
            and logs for more details.
            Possible solutions: increase deduplication interval;
            deduplicate smaller number of series;
            reduce samples' ingestion rate."