From 1bff6c1bbdd45ed242d58631b207359778e8861f Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 13 Nov 2024 00:00:39 +0800 Subject: [PATCH] dashboards: add `file` label filter to vmalert dashboard panels (#7515) Previously, metrics from groups with the same name but in different files could be mixed in the results. e.g. the evaluation time [here](https://grafana.maas.victoriametrics.com/d/LzldHAVnz/victoriametrics-vmalert?orgId=1&var-ds=PE8D8DB4BEE4E4B22&var-job=All&var-instance=All&var-file=%2Fetc%2Fvmalert%2Fconfig%2Fvm-per-tenant-rulefiles-0%2Fmaas-tenant-1011-maas-1011-vm-health.yaml&var-group=All&var-topk=5&editPanel=23) is the total for multiple groups from different tenants. --- dashboards/vm/vmalert.json | 61 +++++++++++++++------- dashboards/vmalert.json | 61 +++++++++++++++------- deployment/docker/rules/alerts-vmalert.yml | 14 ++--- docs/changelog/CHANGELOG.md | 1 + 4 files changed, 92 insertions(+), 45 deletions(-) diff --git a/dashboards/vm/vmalert.json b/dashboards/vm/vmalert.json index 61fa885378..f0eb768812 100644 --- a/dashboards/vm/vmalert.json +++ b/dashboards/vm/vmalert.json @@ -276,7 +276,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -338,7 +338,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -404,7 +404,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", 
instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))", + "expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0))", "interval": "", "legendFormat": "", "refId": "A" @@ -910,9 +910,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n) by(job, instance, group)) \nby(job, group))", + "expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n) by(job, instance, group, file)) \nby(job, group, file))", "interval": "", - "legendFormat": "{{group}} ({{job}})", + "legendFormat": "({{job}}) {{group}}({{file}})", "range": true, "refId": "A" } @@ -2292,9 +2292,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group) > 0", + "expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group, file) > 0", "interval": "1m", - "legendFormat": "__auto", + "legendFormat": "({{job}}) {{group}}({{file}})", "range": true, 
"refId": "A" } @@ -2517,9 +2517,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0)", + "expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0)", "interval": "", - "legendFormat": "{{group}}.{{alertname}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})", "range": true, "refId": "A" } @@ -2619,9 +2619,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0", + "expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, alertname) > 0", "interval": "", - "legendFormat": "{{group}}.{{alertname}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})", "range": true, "refId": "A" } @@ -2721,9 +2721,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0", + "expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0", "interval": "", - "legendFormat": "{{group}}.{{alertname}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})", "range": true, "refId": "A" } @@ -3050,9 +3050,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, instance, group, recording) > 0\n ) by(job, group, recording)\n)", + "expr": "topk_max($topk, \n 
max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, instance, group, file, recording) > 0\n ) by(job, group, file, recording)\n)", "interval": "", - "legendFormat": "{{group}}.{{recording}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})", "range": true, "refId": "A" } @@ -3152,9 +3152,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"} < 1) by(job, group, recording)", + "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"} < 1) by(job, group, file, recording)", "interval": "", - "legendFormat": "{{group}}.{{recording}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})", "range": true, "refId": "A" } @@ -3251,9 +3251,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0", + "expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, recording) > 0", "interval": "", - "legendFormat": "{{group}}.{{recording}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})", "range": true, "refId": "A" } @@ -3749,6 +3749,29 @@ "sort": 0, "type": "query" }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "victoriametrics-datasource", + "uid": "$ds" + }, + "definition": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "file", + "options": [], + "query": { + "query": "label_values(vmalert_iteration_total{job=~\"$job\", 
instance=~\"$instance\"},file)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, { "allValue": ".*", "current": {}, diff --git a/dashboards/vmalert.json b/dashboards/vmalert.json index 929b9b2bb3..77b8df4bb6 100644 --- a/dashboards/vmalert.json +++ b/dashboards/vmalert.json @@ -275,7 +275,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_alerting_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -337,7 +337,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"})", + "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -403,7 +403,7 @@ "uid": "$ds" }, "exemplar": false, - "expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) or vector(0))", + "expr": "(sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0)) + \n(sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) or vector(0))", "interval": "", "legendFormat": "", "refId": "A" @@ -909,9 +909,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk_max($topk, max(sum(\n 
rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])\n) by(job, instance, group)) \nby(job, group))", + "expr": "topk_max($topk, max(sum(\n rate(vmalert_iteration_duration_seconds_sum{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n/\n rate(vmalert_iteration_duration_seconds_count{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])\n) by(job, instance, group, file)) \nby(job, group, file))", "interval": "", - "legendFormat": "{{group}} ({{job}})", + "legendFormat": "({{job}}) {{group}}({{file}})", "range": true, "refId": "A" } @@ -2291,9 +2291,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group) > 0", + "expr": "sum(increase(vmalert_iteration_missed_total{job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])) by(job, group, file) > 0", "interval": "1m", - "legendFormat": "__auto", + "legendFormat": "({{job}}) {{group}}({{file}})", "range": true, "refId": "A" } @@ -2516,9 +2516,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0)", + "expr": "topk_max($topk, sum(vmalert_alerts_firing{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0)", "interval": "", - "legendFormat": "{{group}}.{{alertname}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})", "range": true, "refId": "A" } @@ -2618,9 +2618,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", 
instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, alertname) > 0", + "expr": "sum(increase(vmalert_alerting_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, alertname) > 0", "interval": "", - "legendFormat": "{{group}}.{{alertname}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})", "range": true, "refId": "A" } @@ -2720,9 +2720,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, group, alertname) > 0", + "expr": "sum(vmalert_alerts_pending{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, group, file, alertname) > 0", "interval": "", - "legendFormat": "{{group}}.{{alertname}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{alertname}}({{file}})", "range": true, "refId": "A" } @@ -3049,9 +3049,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}) by(job, instance, group, recording) > 0\n ) by(job, group, recording)\n)", + "expr": "topk_max($topk, \n max(\n sum(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}) by(job, instance, group, file, recording) > 0\n ) by(job, group, file, recording)\n)", "interval": "", - "legendFormat": "{{group}}.{{recording}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})", "range": true, "refId": "A" } @@ -3151,9 +3151,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"} < 1) by(job, group, recording)", + "expr": "count(vmalert_recording_rules_last_evaluation_samples{job=~\"$job\", 
instance=~\"$instance\", group=~\"$group\", file=~\"$file\"} < 1) by(job, group, file, recording)", "interval": "", - "legendFormat": "{{group}}.{{recording}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})", "range": true, "refId": "A" } @@ -3250,9 +3250,9 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\"}[$__rate_interval])) by(job, group, recording) > 0", + "expr": "sum(increase(vmalert_recording_rules_errors_total{job=~\"$job\", instance=~\"$instance\", group=~\"$group\", file=~\"$file\"}[$__rate_interval])) by(job, group, file, recording) > 0", "interval": "", - "legendFormat": "{{group}}.{{recording}} ({{job}})", + "legendFormat": "({{job}}) {{group}}.{{recording}}({{file}})", "range": true, "refId": "A" } @@ -3748,6 +3748,29 @@ "sort": 0, "type": "query" }, + { + "allValue": ".*", + "current": {}, + "datasource": { + "type": "prometheus", + "uid": "$ds" + }, + "definition": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "file", + "options": [], + "query": { + "query": "label_values(vmalert_iteration_total{job=~\"$job\", instance=~\"$instance\"},file)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, { "allValue": ".*", "current": {}, diff --git a/deployment/docker/rules/alerts-vmalert.yml b/deployment/docker/rules/alerts-vmalert.yml index 182e643373..4f21e4e89c 100644 --- a/deployment/docker/rules/alerts-vmalert.yml +++ b/deployment/docker/rules/alerts-vmalert.yml @@ -23,9 +23,9 @@ groups: labels: severity: warning annotations: - dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" + dashboard:
"http://localhost:3000/d/LzldHAVnz?viewPanel=13&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}" summary: "Alerting rules are failing for vmalert instance {{ $labels.instance }}" - description: "Alerting rules execution is failing for group \"{{ $labels.group }}\". + description: "Alerting rules execution is failing for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\". Check vmalert's logs for detailed error message." - alert: RecordingRulesError @@ -34,9 +34,9 @@ groups: labels: severity: warning annotations: - dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-group={{ $labels.group }}" + dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=30&var-instance={{ $labels.instance }}&var-file={{ $labels.file }}&var-group={{ $labels.group }}" summary: "Recording rules are failing for vmalert instance {{ $labels.instance }}" - description: "Recording rules execution is failing for group \"{{ $labels.group }}\". + description: "Recording rules execution is failing for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\". Check vmalert's logs for detailed error message." - alert: RecordingRulesNoData @@ -45,9 +45,9 @@ groups: labels: severity: info annotations: - dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-group={{ $labels.group }}" + dashboard: "http://localhost:3000/d/LzldHAVnz?viewPanel=33&var-file={{ $labels.file }}&var-group={{ $labels.group }}" summary: "Recording rule {{ $labels.recording }} ({{ $labels.group }}) produces no data" - description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" + description: "Recording rule \"{{ $labels.recording }}\" from group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\" produces 0 samples over the last 30min. It might be caused by a misconfiguration or incorrect query expression."
@@ -58,7 +58,7 @@ groups: severity: warning annotations: summary: "vmalert instance {{ $labels.instance }} is missing rules evaluations" - description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\". + description: "vmalert instance {{ $labels.instance }} is missing rules evaluations for group \"{{ $labels.group }}\" in file \"{{ $labels.file }}\". The group evaluation time takes longer than the configured evaluation interval. This may result in missed alerting notifications or recording rules samples. Try increasing evaluation interval or concurrency of group \"{{ $labels.group }}\". See https://docs.victoriametrics.com/vmalert/#groups. diff --git a/docs/changelog/CHANGELOG.md b/docs/changelog/CHANGELOG.md index b0e864d194..105f807109 100644 --- a/docs/changelog/CHANGELOG.md +++ b/docs/changelog/CHANGELOG.md @@ -24,6 +24,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/). * BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): keep the order of resulting time series when `limit_offset` is applied. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7068). * BUGFIX: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): properly handle xFilesFactor=0 for `transformRemoveEmptySeries` function. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7337) for details. * BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth): properly check availability of all the backends before giving up when proxying requests. Previously, vmauth could return an error even if there were healthy backends available. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061) for details. 
+* BUGFIX: [dashboards](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/dashboards): add `file` label filter to vmalert dashboard panels. Previously, metrics from groups with the same name but different rule files could be mixed in the results. ## [v1.106.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.0)