From 0f4bcc00b27dc23e5245d3b8d247ac7575e0c4ef Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Wed, 1 Sep 2021 11:46:22 +0300 Subject: [PATCH] Single dashboards upd (#1593) * dashboard: replace `null` datasources null datasource value may confuse Grafana and make it drop panel query in some versions. * docker: bump grafana image version * dashboards: add URL variable selector to vmagent dashboard * dashboards: add new panel `Remote write connection saturation` to vmagent dashboard * alerts: add new alert for `Remote write connection saturation` panel of vmagent dashboard * dashboards: add "Logging rate" panel to vmagent dashboard --- dashboards/victoriametrics.json | 4 +- dashboards/vmagent.json | 928 +++++++++++++++++---------- deployment/docker/alerts.yml | 13 + deployment/docker/docker-compose.yml | 2 +- 4 files changed, 597 insertions(+), 350 deletions(-) diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 890c48f22..073882f32 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -91,7 +91,7 @@ "id": 6, "panels": [ { - "datasource": null, + "datasource": "$ds", "description": "", "gridPos": { "h": 2, @@ -373,7 +373,7 @@ "type": "stat" }, { - "datasource": null, + "datasource": "$ds", "fieldConfig": { "defaults": { "color": { diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 7e211cf2d..2c3fe6ec8 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -5,7 +5,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.0.0" + "version": "8.1.2" }, { "type": "panel", @@ -47,6 +47,12 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] @@ -56,7 +62,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1623414948941, + "iteration": 1630485687361, "links": [ { "icon": "doc", @@ -88,10 +94,6 
@@ { "collapsed": false, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, @@ -143,7 +145,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})", @@ -207,7 +209,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})", @@ -274,7 +276,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", @@ -333,7 +335,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})", @@ -483,7 +485,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -586,12 +588,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:457", "alias": "out", "transform": "negative-Y" } @@ -698,7 +701,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -798,7 +801,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -872,7 +875,7 @@ "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Errors rate shows rate for multiple metrics that track possible errors 
in vmagent, such as network or parsing errors.", + "description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", "fieldConfig": { "defaults": { "links": [] @@ -888,6 +891,218 @@ "y": 16 }, "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" + } + ], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(vmagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}) by (url)", + "interval": "", + "legendFormat": "{{url}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Persistent queue size ($instance) to ($url)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + 
"show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows rate of dropped samples from persistent queue. VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 49, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "targetBlank": true, + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" + } + ], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(vm_persistentqueue_bytes_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by (path)", + "interval": "", + "legendFormat": "{{ path }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Persistent queue dropped rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 
"0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Errors rate shows rate for multiple metrics that track possible errors in vmagent, such as network or parsing errors.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, "id": 69, "legend": { "alignAsTable": true, @@ -915,7 +1130,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -996,217 +1211,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows rate of dropped samples from persistent queue. 
VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "hiddenSeries": false, - "id": 49, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - { - "targetBlank": true, - "title": "Troubleshooting", - "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" - } - ], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.0.0", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(increase(vm_persistentqueue_bytes_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by (path)", - "interval": "", - "legendFormat": "{{ path }}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Persistent queue dropped rate ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, 
- "dashes": false, - "datasource": "$ds", - "description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - { - "title": "Troubleshooting", - "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" - } - ], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.0.0", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vmagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\"}) by (url)", - "interval": "", - "legendFormat": "{{url}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Persistent queue size ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - 
"logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -1256,7 +1260,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1266,7 +1270,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vmagent_remotewrite_global_relabel_metrics_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))", + "exemplar": true, + "expr": "sum(rate(vmagent_remotewrite_global_relabel_metrics_dropped_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval]))", "interval": "", "legendFormat": "global", "refId": "A" @@ -1282,7 +1287,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Rows dropped by relabeling ($instance)", + "title": "Rows dropped by relabeling ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -1362,7 +1367,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1373,9 +1378,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))", + "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(url)", "interval": "", - "legendFormat": "dropped", + "legendFormat": "{{url}}", "refId": "A" } ], @@ -1383,7 +1388,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Data blocks dropped ($instance)", + "title": "Data blocks dropped ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -1421,17 +1426,118 @@ } }, { - "collapsed": true, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", + "description": "Shows the rate of logging 
the messages by their level. Unexpected spike in rate is a good reason to check logs.", "fieldConfig": { - "defaults": {}, + "defaults": { + "links": [] + }, "overrides": [] }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 86, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=~\"$instance\"}[5m])) by (level) ", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logging rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "$ds", "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 40 + "y": 48 }, "id": 28, "panels": [ @@ -1453,7 +1559,7 @@ "h": 7, "w": 12, "x": 0, - "y": 2 + "y": 41 }, "hiddenSeries": false, "id": 48, @@ -1476,7 +1582,7 
@@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1553,7 +1659,7 @@ "h": 7, "w": 12, "x": 12, - "y": 2 + "y": 41 }, "hiddenSeries": false, "id": 76, @@ -1576,7 +1682,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1653,7 +1759,7 @@ "h": 8, "w": 12, "x": 0, - "y": 9 + "y": 48 }, "hiddenSeries": false, "id": 20, @@ -1676,7 +1782,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1762,7 +1868,7 @@ "h": 8, "w": 12, "x": 12, - "y": 9 + "y": 48 }, "hiddenSeries": false, "id": 31, @@ -1785,7 +1891,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1878,7 +1984,7 @@ "h": 8, "w": 12, "x": 0, - "y": 17 + "y": 56 }, "hiddenSeries": false, "id": 46, @@ -1901,7 +2007,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1984,7 +2090,7 @@ "h": 8, "w": 12, "x": 12, - "y": 17 + "y": 56 }, "heatmap": {}, "hideZeroBuckets": false, @@ -2037,15 +2143,11 @@ { "collapsed": true, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 41 + "y": 49 }, "id": 71, "panels": [ @@ -2068,7 +2170,7 @@ "h": 8, "w": 12, "x": 0, - "y": 3 + "y": 42 }, "hiddenSeries": false, "id": 73, @@ -2092,7 +2194,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2176,7 +2278,7 @@ "h": 8, "w": 12, "x": 12, - "y": 3 + "y": 42 }, "hiddenSeries": false, "id": 77, 
@@ -2200,7 +2302,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2284,7 +2386,7 @@ "h": 8, "w": 12, "x": 0, - "y": 11 + "y": 50 }, "hiddenSeries": false, "id": 78, @@ -2308,7 +2410,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2385,7 +2487,7 @@ "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 50 }, "hiddenSeries": false, "id": 50, @@ -2408,7 +2510,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2473,15 +2575,11 @@ { "collapsed": true, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 42 + "y": 50 }, "id": 58, "panels": [ @@ -2527,7 +2625,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2537,7 +2635,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vmagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url, status_code)", + "exemplar": true, + "expr": "sum(rate(vmagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(url, status_code)", "interval": "", "legendFormat": "", "refId": "A" @@ -2547,7 +2646,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Requests rate ($instance)", + "title": "Requests rate ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -2627,7 +2726,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2637,6 +2736,7 @@ "steppedLine": false, "targets": [ { + 
"exemplar": true, "expr": "sum(rate(vmagent_remotewrite_conn_bytes_written_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(instance)", "interval": "", "legendFormat": "", @@ -2726,7 +2826,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2737,7 +2837,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url)", + "expr": "sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(url)", "interval": "", "legendFormat": "", "refId": "A" @@ -2747,7 +2847,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Retry rate ($instance)", + "title": "Retry rate ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -2826,7 +2926,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2836,6 +2936,7 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(vmagent_remotewrite_conns{job=~\"$job\", instance=~\"$instance\"}) by (instance)", "interval": "", "legendFormat": "", @@ -2897,7 +2998,7 @@ }, "dataFormat": "tsbuckets", "datasource": "$ds", - "description": "Shows the remote write request block size distribution in rows.", + "description": "Shows the remote write request duration distribution in seconds. 
Value depends on block size, network quality and remote storage performance.", "gridPos": { "h": 8, "w": 12, @@ -2907,6 +3008,184 @@ "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, + "id": 30, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "buckets_limit(12, prometheus_buckets(sum(rate(vmagent_remotewrite_duration_seconds_bucket{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(vmrange)))", + "format": "heatmap", + "interval": "", + "intervalFactor": 10, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Push duration ($instance) to ($url)", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 2, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows saturation of every connection to remote storage. If the threshold of 0.9sec is reached, then the connection is saturated by more than 90% and vmagent won't be able to keep up. 
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage.\n", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "hiddenSeries": false, + "id": 84, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(vmagent_remotewrite_send_duration_seconds_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by (instance, url)", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + { + "$$hashKey": "object:683", + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.9, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote write connection saturation ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:662", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:663", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + 
"cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "$ds", + "description": "Shows the remote write request block size distribution in rows.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, "id": 63, "legend": { "show": false @@ -2914,6 +3193,7 @@ "reverseYBuckets": false, "targets": [ { + "exemplar": true, "expr": "buckets_limit(12, prometheus_buckets(sum(rate(vmagent_remotewrite_block_size_rows_bucket{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(vmrange)))", "format": "heatmap", "interval": "", @@ -2967,7 +3247,7 @@ "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 28 }, "heatmap": {}, "hideZeroBuckets": false, @@ -3012,71 +3292,6 @@ "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateOranges", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "$ds", - "description": "Shows the remote write request duration distribution in seconds. 
Value depends on block size, network quality and remote storage performance.", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 28 - }, - "heatmap": {}, - "hideZeroBuckets": false, - "highlightCards": true, - "id": 30, - "legend": { - "show": false - }, - "reverseYBuckets": false, - "targets": [ - { - "expr": "buckets_limit(12, prometheus_buckets(sum(rate(vmagent_remotewrite_duration_seconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(vmrange)))", - "format": "heatmap", - "interval": "", - "intervalFactor": 10, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Push duration ($instance)", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 2, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null } ], "title": "Remote write", @@ -3085,15 +3300,11 @@ { "collapsed": true, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 43 + "y": 51 }, "id": 45, "panels": [ @@ -3942,11 +4153,34 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "$ds", + "definition": "label_values(vmagent_remotewrite_requests_total{job=\"$job\", instance=~\"$instance\"}, url)", + "description": "The remote write URLs", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "url", + "options": [], + "query": { + "query": "label_values(vmagent_remotewrite_requests_total{job=\"$job\", instance=~\"$instance\"}, url)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" } ] }, "time": { - "from": "now-1h", + "from": 
"now-30m", "to": "now" }, "timepicker": { diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index bc8780e79..3e3132d38 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -248,3 +248,16 @@ groups: description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n Ensure that destination is up and reachable." + - alert: RemoteWriteConnectionIsSaturated + expr: rate(vmagent_remotewrite_send_duration_seconds_total[5m]) > 0.9 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}" + summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated" + description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" + is saturated by more than 90% and vmagent won't be able to keep up.\n + This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase + the number of connections per each remote storage." + diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml index e52696caf..fbbd679bf 100644 --- a/deployment/docker/docker-compose.yml +++ b/deployment/docker/docker-compose.yml @@ -39,7 +39,7 @@ services: restart: always grafana: container_name: grafana - image: grafana/grafana:8.0.0 + image: grafana/grafana:8.1.2 depends_on: - "victoriametrics" ports: