2022-09-21 10:48:38 +02:00
|
|
|
# This file contains the default list of alerts for a single-node VictoriaMetrics server.
|
2021-01-11 12:03:15 +01:00
|
|
|
# The alerts below are only recommendations; they may require updates
# and threshold calibration for each specific setup.
|
2020-11-07 16:00:23 +01:00
|
|
|
groups:
  # Alerts group for VM single. It assumes that the Grafana dashboard
  # https://grafana.com/grafana/dashboards/10229 is installed.
  # Please update the `dashboard` annotations according to your setup.
  - name: vmsingle
    interval: 30s
    concurrency: 2
    rules:
      # Free disk space divided by the estimated on-disk growth per second:
      # rows added per second (averaged over 1d) multiplied by the average
      # size of a row, both taken from non-indexdb parts.
      # The result is a duration in seconds; the rule matches when it is
      # below three days. The trailing `> 0` drops non-positive results.
      - alert: DiskRunsOutOfSpaceIn3Days
        expr: |
          vm_free_disk_space_bytes / ignoring(path)
          (
            rate(vm_rows_added_to_storage_total[1d])
            * scalar(
              sum(vm_data_size_bytes{type!~"indexdb.*"}) /
              sum(vm_rows{type!~"indexdb.*"})
            )
          ) < 3 * 24 * 3600 > 0
        for: 30m
        labels:
          severity: critical
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=73&var-instance={{ $labels.instance }}'
          summary: 'Instance {{ $labels.instance }} will run out of disk space soon'
          # Double quotes are required here: the text relies on the `\n` escape.
          description: "Taking into account current ingestion rate, free disk space will be enough only
            for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
            Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."

      # Share of stored data in (free space + stored data), per job and
      # instance; matches when the share exceeds 0.8.
      - alert: DiskRunsOutOfSpace
        expr: |
          sum(vm_data_size_bytes) by(job, instance) /
          (
            sum(vm_free_disk_space_bytes) by(job, instance) +
            sum(vm_data_size_bytes) by(job, instance)
          ) > 0.8
        for: 30m
        labels:
          severity: critical
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=53&var-instance={{ $labels.instance }}'
          summary: 'Instance {{ $labels.instance }} (job={{ $labels.job }}) will run out of disk space soon'
          # Double quotes are required here: the text relies on the `\n` escape.
          description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
            Having less than 20% of free disk space could cripple merge processes and overall performance.
            Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."

      # Matches whenever the HTTP request error counter grew within the
      # last 5 minutes.
      - alert: RequestErrorsToAPI
        expr: increase(vm_http_request_errors_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=35&var-instance={{ $labels.instance }}'
          summary: 'Too many errors served for path {{ $labels.path }} (instance {{ $labels.instance }})'
          description: >-
            Requests to path {{ $labels.path }} are receiving errors.
            Please verify if clients are sending correct requests.

      # Matches whenever the per-second rate of ignored rows over the last
      # 5 minutes is above zero. The description reports the `reason` label.
      - alert: RowsRejectedOnIngestion
        expr: rate(vm_rows_ignored_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=58&var-instance={{ $labels.instance }}'
          summary: 'Some rows are rejected on "{{ $labels.instance }}" on ingestion attempt'
          description: >-
            VM is rejecting to ingest rows on "{{ $labels.instance }}" due to the
            following reason: "{{ $labels.reason }}"

      # Ratio of newly created time series to inserted rows (both as 5m
      # rates, summed per instance); matches when the ratio exceeds 0.1.
      - alert: TooHighChurnRate
        expr: |
          (
            sum(rate(vm_new_timeseries_created_total[5m])) by(instance)
            /
            sum(rate(vm_rows_inserted_total[5m])) by (instance)
          ) > 0.1
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}'
          summary: 'Churn rate is more than 10% on "{{ $labels.instance }}" for the last 15m'
          # Double quotes are required here: the text relies on the `\n` escape.
          description: "VM constantly creates new time series on \"{{ $labels.instance }}\".\n
            This effect is known as Churn Rate.\n
            High Churn Rate tightly connected with database performance and may
            result in unexpected OOM's or slow queries."

      # Number of time series created over the last 24h, per instance,
      # compared against three times the entry count of the
      # `storage/hour_metric_ids` cache.
      - alert: TooHighChurnRate24h
        expr: |
          sum(increase(vm_new_timeseries_created_total[24h])) by(instance)
          >
          (sum(vm_cache_entries{type="storage/hour_metric_ids"}) by(instance) * 3)
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=66&var-instance={{ $labels.instance }}'
          summary: 'Too high number of new series on "{{ $labels.instance }}" created over last 24h'
          # Double quotes are required here: the text relies on the `\n` escape.
          description: "The number of created new time series over last 24h is 3x times higher than
            current number of active series on \"{{ $labels.instance }}\".\n
            This effect is known as Churn Rate.\n
            High Churn Rate tightly connected with database performance and may
            result in unexpected OOM's or slow queries."

      # Ratio of slow row inserts to all inserted rows (both as 5m rates,
      # summed per instance); matches when the ratio exceeds 0.05.
      - alert: TooHighSlowInsertsRate
        expr: |
          (
            sum(rate(vm_slow_row_inserts_total[5m])) by(instance)
            /
            sum(rate(vm_rows_inserted_total[5m])) by (instance)
          ) > 0.05
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=68&var-instance={{ $labels.instance }}'
          summary: 'Percentage of slow inserts is more than 5% on "{{ $labels.instance }}" for the last 15m'
          description: >-
            High rate of slow inserts on "{{ $labels.instance }}" may be a sign of resource exhaustion
            for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series.
            See also https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3976#issuecomment-1476883183

      # Matches whenever the counter of metrics with dropped labels grew
      # within the last 5 minutes.
      - alert: LabelsLimitExceededOnIngestion
        expr: increase(vm_metrics_with_dropped_labels_total[5m]) > 0
        for: 15m
        labels:
          severity: warning
        annotations:
          dashboard: 'http://localhost:3000/d/wNf0q_kZk?viewPanel=74&var-instance={{ $labels.instance }}'
          summary: 'Metrics ingested in ({{ $labels.instance }}) are exceeding labels limit'
          # Double quotes are required here: the text relies on the `\n` escape.
          description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n
            This prevents ingestion of metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured
            correctly or that clients which send these metrics aren't misbehaving."