diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml
index f61bef0ee1..bc8780e79b 100644
--- a/deployment/docker/alerts.yml
+++ b/deployment/docker/alerts.yml
@@ -19,11 +19,31 @@ groups:
         expr: up{job=~"victoriametrics|vmagent|vmalert"} == 0
         for: 2m
         labels:
-          severity: "critical"
+          severity: critical
         annotations:
           summary: "Service {{ $labels.job }} is down on {{ $labels.instance }}"
           description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."
 
+      - alert: ProcessNearFDLimits
+        expr: (process_max_fds - process_open_fds) < 100
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
+          description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
+            Consider to increase the limit as fast as possible."
+
+      - alert: TooHighMemoryUsage
+        expr: (process_resident_memory_anon_bytes / vm_available_memory_bytes) > 0.9
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "It is more than 90% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
+          description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
+            Consider to either increase available memory or decrease the load on the process."
+
   # Alerts group for VM single assumes that Grafana dashboard
   # https://grafana.com/grafana/dashboards/10229 is installed.
   # Pls update the `dashboard` annotation according to your setup.
@@ -166,17 +186,6 @@ groups:
           description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion for the current load.
             It is likely more RAM is needed for optimal handling of the current number of active time series."
 
-      - alert: ProcessNearFDLimits
-        expr: (process_max_fds - process_open_fds) < 100
-        for: 5m
-        labels:
-          severity: critical
-        annotations:
-          dashboard: "http://localhost:3000/d/wNf0q_kZk?viewPanel=75&var-instance={{ $labels.instance }}"
-          summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
-          description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
-            Consider to increase the limit as fast as possible."
-
       - alert: LabelsLimitExceededOnIngestion
         expr: sum(increase(vm_metrics_with_dropped_labels_total[5m])) by (instance) > 0
         for: 15m