deployment/docker: use multi-line formatting in alerts-health.yml, fixes #6393 (#6394)

### Describe Your Changes

Please provide a brief description of the changes you made. Be as specific as possible to help others understand the purpose and impact of your modifications.

### Checklist

The following checks are **mandatory**:

- [x] My change adheres to the [VictoriaMetrics contributing guidelines](https://docs.victoriametrics.com/contributing/).

Signed-off-by: Artem Navoiev <tenmozes@gmail.com>
2024-11-23 12:31:07 +01:00 · 2024-06-03 02:31:53 -07:00 · 2024-06-03 02:31:53 -07:00 · 508946ed9d
commit 508946ed9d
parent 53422797a7
1 changed files with 35 additions and 25 deletions
--- a/deployment/docker/alerts-health.yml
+++ b/deployment/docker/alerts-health.yml
@ -12,8 +12,9 @@ groups:
          severity: critical
        annotations:
          summary: "{{ $labels.job }} too many restarts (instance {{ $labels.instance }})"
-          description: "Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
+          description: >
-            It might be crashlooping."
+            Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes.
            It might be crashlooping.
      - alert: ServiceDown
        expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0
@ -31,8 +32,9 @@ groups:
          severity: critical
        annotations:
          summary: "Number of free file descriptors is less than 100 for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 5m"
-          description: "Exhausting OS file descriptors limit can cause severe degradation of the process.
+          description: | 
-          Consider to increase the limit as fast as possible."
+            Exhausting OS file descriptors limit can cause severe degradation of the process.
            Consider to increase the limit as fast as possible.
      - alert: TooHighMemoryUsage
        expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8
@ -41,8 +43,9 @@ groups:
          severity: critical
        annotations:
          summary: "It is more than 80% of memory used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\")"
-          description: "Too high memory usage may result into multiple issues such as OOMs or degraded performance.
+          description: |
-           Consider to either increase available memory or decrease the load on the process."
+            Too high memory usage may result into multiple issues such as OOMs or degraded performance.
            Consider to either increase available memory or decrease the load on the process.
      - alert: TooHighCPUUsage
        expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9
@ -51,8 +54,9 @@ groups:
          severity: critical
        annotations:
          summary: "More than 90% of CPU is used by \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") during the last 5m"
-          description: "Too high CPU usage may be a sign of insufficient resources and make process unstable.
+          description: >
-               Consider to either increase available CPU resources or decrease the load on the process."
+            Too high CPU usage may be a sign of insufficient resources and make process unstable.
            Consider to either increase available CPU resources or decrease the load on the process.
      - alert: TooManyLogs
        expr: sum(increase(vm_log_messages_total{level="error"}[5m])) without (app_version, location) > 0
@ -61,8 +65,9 @@ groups:
          severity: warning
        annotations:
          summary: "Too many logs printed for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
-          description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n
+          description: >
-         Worth to check logs for specific error messages."
+            Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.
            Worth to check logs for specific error messages.
      - alert: TooManyTSIDMisses
        expr: rate(vm_missing_tsids_for_metric_id_total[5m]) > 0
@ -71,9 +76,10 @@ groups:
          severity: critical
        annotations:
          summary: "Too many TSID misses for job \"{{ $labels.job }}\" ({{ $labels.instance }})"
-          description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).\n
+          description: | 
-         Make sure you're running VictoriaMetrics of v1.85.3 or higher.\n
+            The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).
-         Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502"
+            Make sure you're running VictoriaMetrics of v1.85.3 or higher.
            Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502
      - alert: ConcurrentInsertsHitTheLimit
        expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity
@ -82,11 +88,12 @@ groups:
          severity: warning
        annotations:
          summary: "{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit"
-          description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n
+          description: | 
            The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.
            Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU.
            In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients
            making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then 
-            it might be worth adjusting `-maxConcurrentInserts` cmd-line flag."
+            it might be worth adjusting `-maxConcurrentInserts` cmd-line flag.
      - alert: IndexDBRecordsDrop
        expr: increase(vm_indexdb_items_dropped_total[5m]) > 0
@ -94,10 +101,11 @@ groups:
          severity: critical
        annotations:
          summary: "IndexDB skipped registering items during data ingestion with reason={{ $labels.reason }}."
-          description: "VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process. 
+          description: | 
-          For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number 
+            VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process. 
-          of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and 
+            For example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number 
-          `-maxLabelValueLen` command-line flags."
+            of labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and 
            `-maxLabelValueLen` command-line flags.
      - alert: TooLongLabelValues
        expr: increase(vm_too_long_label_values_total[5m]) > 0
@ -105,10 +113,11 @@ groups:
          severity: critical
        annotations:
          summary: "VictoriaMetrics truncates too long label values"
-          description: "The maximum length of a label value is limited via `-maxLabelValueLen` cmd-line flag. 
+          description: | 
-           Longer label values are truncated and may result into time series overlapping.
+            The maximum length of a label value is limited via `-maxLabelValueLen` cmd-line flag. 
-           Please, check your logs to find which labels were truncated and  
+            Longer label values are truncated and may result into time series overlapping.
-           either reduce the size of label values or increase `-maxLabelValueLen`".
+            Please, check your logs to find which labels were truncated and  
            either reduce the size of label values or increase `-maxLabelValueLen`.
      - alert: TooLongLabelNames
        expr: increase(vm_too_long_label_names_total[5m]) > 0
@ -116,5 +125,6 @@ groups:
          severity: critical
        annotations:
          summary: "VictoriaMetrics truncates too long label names"
-          description: "The maximum length of a label name is limited by 256 bytes. 
+          description: > 
-           Longer label names are truncated and may result into time series overlapping.".
+           The maximum length of a label name is limited by 256 bytes. 
           Longer label names are truncated and may result into time series overlapping.