2021-04-05 21:29:04 +02:00
|
|
|
# File contains default list of alerts for VM cluster and vmagent services.
|
|
|
|
# The alerts below are just recommendations and may require some updates
|
|
|
|
# and threshold calibration according to every specific setup.
|
|
|
|
groups:
- name: serviceHealth
  # Both rules below select targets with a `job` regex filter;
  # update it according to the job names used in your setup.
  rules:
  # Fires when the process start time changed more than twice
  # within a 15m window.
  - alert: TooManyRestarts
    expr: changes(process_start_time_seconds{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"}[15m]) > 2
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
      description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted
        more than twice in the last 15 minutes. It might be crashlooping."

  # Fires when a matched target keeps reporting `up == 0` for 2 minutes.
  - alert: ServiceDown
    expr: up{job=~"vmselect|vminsert|vmstorage|vmagent|vmalert"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down
        for more than 2 minutes."

# Alerts group for VM cluster assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/11176 is installed.
# Please, update the `dashboard` annotation according to your setup.
- name: vmcluster
  interval: 30s
  concurrency: 2
  rules:
  # Compares free disk space against (ingestion rate * average bytes per row)
  # and fires when the resulting number of seconds is below 3 days.
  # NOTE(review): the first sum() has no grouping clause while the second one
  # uses `without(type)`; confirm that both operands of the subtraction end up
  # with matching label sets on your setup.
  - alert: DiskRunsOutOfSpaceIn3Days
    expr: |
      vm_free_disk_space_bytes / ignoring(path) (
        (
          sum(rate(vm_rows_added_to_storage_total[1d])) -
          sum(rate(vm_deduplicated_samples_total[1d])) without(type)
        )
        *
        (
          sum(vm_data_size_bytes{type!="indexdb"}) /
          sum(vm_rows{type!="indexdb"})
        )
      ) < 3 * 24 * 3600
    for: 30m
    labels:
      severity: critical
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=113&var-instance={{ $labels.instance }}"
      summary: "Instance {{ $labels.instance }} will run out of disk space in 3 days"
      description: "Taking into account current ingestion rate, free disk space will be enough only
        for {{ $value | humanizeDuration }} on instance {{ $labels.instance }}.\n
        Consider to limit the ingestion rate, decrease retention or scale the disk space up if possible."

  # Fires when data size exceeds 80% of (data size + free space) per instance.
  - alert: DiskRunsOutOfSpace
    expr: |
      sum(vm_data_size_bytes) by(instance) /
      (
        sum(vm_free_disk_space_bytes) by(instance) +
        sum(vm_data_size_bytes) by(instance)
      ) > 0.8
    for: 30m
    labels:
      severity: critical
    annotations:
      # The value must be fully quoted: without the opening quote the closing
      # one becomes part of the URL.
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=110&var-instance={{ $labels.instance }}"
      summary: "Instance {{ $labels.instance }} will run out of disk space soon"
      description: "Disk utilisation on instance {{ $labels.instance }} is more than 80%.\n
        Having less than 20% of free disk space could cripple merges processes and overall performance.
        Consider to limit the ingestion rate, decrease retention or scale the disk space if possible."

  # Fires when the HTTP request error counter keeps growing.
  - alert: RequestErrorsToAPI
    expr: increase(vm_http_request_errors_total[5m]) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=52&var-instance={{ $labels.instance }}"
      summary: "Too many errors served for {{ $labels.job }} path {{ $labels.path }} (instance {{ $labels.instance }})"
      description: "Requests to path {{ $labels.path }} are receiving errors.
        Please verify if clients are sending correct requests."

  # Sums the increases of four RPC error counters per job and instance.
  - alert: RPCErrors
    expr: |
      (
        sum(increase(vm_rpc_connection_errors_total[5m])) by(job, instance)
        +
        sum(increase(vm_rpc_dial_errors_total[5m])) by(job, instance)
        +
        sum(increase(vm_rpc_handshake_errors_total[5m])) by(job, instance)
        +
        sum(increase(vm_rpc_reroute_errors_total[5m])) by(job, instance)
      ) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=44&var-instance={{ $labels.instance }}"
      summary: "Too many RPC errors for {{ $labels.job }} (instance {{ $labels.instance }})"
      description: "RPC errors are interconnection errors between cluster components.\n
        Possible reasons for errors are misconfiguration, overload, network blips or unreachable components."

  # Fires when the 1m average of concurrent addrows reaches its capacity.
  - alert: ConcurrentFlushesHitTheLimit
    expr: avg_over_time(vm_concurrent_addrows_current[1m]) >= vm_concurrent_addrows_capacity
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=133&var-instance={{ $labels.instance }}"
      summary: "vmstorage on instance {{ $labels.instance }} is constantly hitting concurrent flushes limit"
      description: "The limit of concurrent flushes on instance {{ $labels.instance }} is equal to number of CPUs.\n
        When vmstorage constantly hits the limit it means that storage is overloaded and requires more CPU."

  # Fires when log messages with a level other than "info" keep appearing.
  # NOTE(review): the description says "for last 15m" while the expression
  # uses a 5m increase window; confirm which one is intended.
  - alert: TooManyLogs
    expr: sum(increase(vm_log_messages_total{level!="info"}[5m])) by (job, instance) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=104&var-instance={{ $labels.instance }}"
      summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
      description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
        Worth to check logs for specific error messages."

  # Fires when rows are ignored on ingestion, grouped by instance and reason.
  - alert: RowsRejectedOnIngestion
    expr: sum(rate(vm_rows_ignored_total[5m])) by (instance, reason) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=135&var-instance={{ $labels.instance }}"
      summary: "Some rows are rejected on \"{{ $labels.instance }}\" on ingestion attempt"
      description: "VM is rejecting to ingest rows on \"{{ $labels.instance }}\" due to the
        following reason: \"{{ $labels.reason }}\""

  # Fires when new series creation rate exceeds 10% of the row insert rate.
  - alert: TooHighChurnRate
    expr: |
      (
        sum(rate(vm_new_timeseries_created_total[5m]))
        /
        sum(rate(vm_rows_inserted_total[5m]))
      ) > 0.1
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102"
      summary: "Churn rate is more than 10% for the last 15m"
      description: "VM constantly creates new time series.\n
        This effect is known as Churn Rate.\n
        High Churn Rate tightly connected with database performance and may
        result in unexpected OOM's or slow queries."

  # Fires when series created over 24h exceed 3x the hour_metric_ids cache size.
  - alert: TooHighChurnRate24h
    expr: |
      sum(increase(vm_new_timeseries_created_total[24h]))
      >
      (sum(vm_cache_entries{type="storage/hour_metric_ids"})* 3)
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=102"
      summary: "Too high number of new series created over last 24h"
      description: "The number of created new time series over last 24h is 3x times higher than
        current number of active series.\n
        This effect is known as Churn Rate.\n
        High Churn Rate tightly connected with database performance and may
        result in unexpected OOM's or slow queries."

  # Fires when slow inserts exceed 5% of all inserted rows.
  - alert: TooHighSlowInsertsRate
    expr: |
      (
        sum(rate(vm_slow_row_inserts_total[5m]))
        /
        sum(rate(vm_rows_inserted_total[5m]))
      ) > 0.05
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=108"
      summary: "Percentage of slow inserts is more than 5% for the last 15m"
      description: "High rate of slow inserts may be a sign of resource exhaustion
        for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series."

  # Fires when fewer than 100 file descriptors remain before the limit.
  - alert: ProcessNearFDLimits
    expr: (process_max_fds - process_open_fds) < 100
    for: 5m
    labels:
      severity: critical
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=117&var-instance={{ $labels.instance }}"
      summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
      description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
        Consider to increase the limit as fast as possible."

  # Fires when metrics keep getting labels dropped on ingestion.
  - alert: LabelsLimitExceededOnIngestion
    expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=116&var-instance={{ $labels.instance }}"
      summary: "Metrics ingested to vminsert on ({{ $labels.instance }}) are exceeding labels limit"
      description: "VictoriaMetrics limits the number of labels per each metric with `-maxLabelsPerTimeseries` command-line flag.\n
        This prevents from ingesting metrics with too many labels. Please verify that `-maxLabelsPerTimeseries` is configured
        correctly or that clients which send these metrics aren't misbehaving."

  # Fires when the lost-rows counter keeps growing.
  # `job` is kept in the grouping because the summary renders
  # {{ $labels.job }}; grouping by `instance` alone drops that label and
  # leaves the placeholder empty.
  - alert: VminsertIsDroppingRows
    expr: sum(rate(vm_rpc_rows_lost_total[5m])) by(job, instance) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/oS7Bi_0Wz?viewPanel=84&var-instance={{ $labels.instance }}"
      summary: "VMinsert \"{{ $labels.job }}\" on instance {{ $labels.instance }} drops rows due to RPC errors."
      description: "VMinsert starts to drop rows if there are no healthy VMstorage nodes where it can route
        insert requests to. Check the health state of VMstorage nodes and RPC metrics."

# Alerts group for vmagent assumes that Grafana dashboard
# https://grafana.com/grafana/dashboards/12683 is installed.
# Please, update the `dashboard` annotation according to your setup.
- name: vmagent
  interval: 30s
  concurrency: 2
  rules:
  # Fires when bytes keep being dropped from the persistent queue.
  # NOTE(review): the description says "for the last 10m" while the
  # expression uses a 5m increase window; confirm which one is intended.
  - alert: PersistentQueueIsDroppingData
    expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance) > 0
    for: 10m
    labels:
      severity: critical
    annotations:
      dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=49&var-instance={{ $labels.instance }}"
      summary: "Instance {{ $labels.instance }} is dropping data from persistent queue"
      description: "Vmagent dropped {{ $value | humanize1024 }} from persistent queue on
        instance {{ $labels.instance }} for the last 10m."

  # Fires when the failed-scrapes counter keeps growing.
  - alert: TooManyScrapeErrors
    expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=31&var-instance={{ $labels.instance }}"
      summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to scrape targets for last 15m"

  # Adds up ingest-server and HTTP request error increases per job/instance.
  - alert: TooManyWriteErrors
    expr: |
      (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance)
      +
      sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance)) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=77&var-instance={{ $labels.instance }}"
      summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} responds with errors to write requests for last 15m."

  # Fires when remote write retries keep happening for a destination `url`.
  - alert: TooManyRemoteWriteErrors
    expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, url) > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=61&var-instance={{ $labels.instance }}"
      summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage"
      description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n
        Ensure that destination is up and reachable."