From af8c1feddb3bb5d4fa4bf48237ccbe273bf3635d Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Wed, 1 Sep 2021 11:46:22 +0300 Subject: [PATCH] Single dashboards upd (#1593) * dashboard: replace `null` datasources null datasource value may confuse Grafana and make it drop panel query in some versions. * docker: bump grafana image version * dashboards: add URL variable selector to vmagent dashboard * dashboards: add new panel `Remote write connection saturation` to vmagent dashboard * alerts: add new alert for `Remote write connection saturation` panel of vmagent dashboard * dashboards: add "Logging rate" panel to vmagent dashboard --- dashboards/vmagent.json | 928 +++++++++++++++++---------- deployment/docker/alerts.yml | 13 + deployment/docker/docker-compose.yml | 2 +- 3 files changed, 595 insertions(+), 348 deletions(-) diff --git a/dashboards/vmagent.json b/dashboards/vmagent.json index 7e211cf2de..2c3fe6ec89 100644 --- a/dashboards/vmagent.json +++ b/dashboards/vmagent.json @@ -5,7 +5,7 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "8.0.0" + "version": "8.1.2" }, { "type": "panel", @@ -47,6 +47,12 @@ "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] @@ -56,7 +62,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1623414948941, + "iteration": 1630485687361, "links": [ { "icon": "doc", @@ -88,10 +94,6 @@ { "collapsed": false, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, @@ -143,7 +145,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"up\"})", @@ -207,7 +209,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { 
"expr": "sum(vm_promscrape_targets{job=~\"$job\", instance=~\"$instance\", status=\"down\"})", @@ -274,7 +276,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(increase(vm_log_messages_total{job=~\"$job\", instance=~\"$instance\", level!=\"info\"}[30m]))", @@ -333,7 +335,7 @@ "text": {}, "textMode": "auto" }, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "targets": [ { "expr": "sum(vm_persistentqueue_bytes_pending{job=~\"$job\", instance=~\"$instance\"})", @@ -483,7 +485,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -586,12 +588,13 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [ { + "$$hashKey": "object:457", "alias": "out", "transform": "negative-Y" } @@ -698,7 +701,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -798,7 +801,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -872,7 +875,7 @@ "dashLength": 10, "dashes": false, "datasource": "$ds", - "description": "Errors rate shows rate for multiple metrics that track possible errors in vmagent, such as network or parsing errors.", + "description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. 
In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", "fieldConfig": { "defaults": { "links": [] @@ -888,6 +891,218 @@ "y": 16 }, "hiddenSeries": false, + "id": 17, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" + } + ], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(vmagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}) by (url)", + "interval": "", + "legendFormat": "{{url}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Persistent queue size ($instance) to ($url)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows rate of dropped samples from 
persistent queue. VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 24 + }, + "hiddenSeries": false, + "id": 49, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + { + "targetBlank": true, + "title": "Troubleshooting", + "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" + } + ], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(vm_persistentqueue_bytes_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by (path)", + "interval": "", + "legendFormat": "{{ path }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Persistent queue dropped rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + 
"dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Errors rate shows rate for multiple metrics that track possible errors in vmagent, such as network or parsing errors.", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 24 + }, + "hiddenSeries": false, "id": 69, "legend": { "alignAsTable": true, @@ -915,7 +1130,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -996,217 +1211,6 @@ "alignLevel": null } }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows rate of dropped samples from persistent queue. VMagent drops samples from queue if in-memory and on-disk queues are full and it is unable to flush them to remote storage.\nThe max size of on-disk queue is configured by `-remoteWrite.maxDiskUsagePerURL` flag.", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 24 - }, - "hiddenSeries": false, - "id": 49, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - { - "targetBlank": true, - "title": "Troubleshooting", - "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" - } - ], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.0.0", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": 
"sum(increase(vm_persistentqueue_bytes_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by (path)", - "interval": "", - "legendFormat": "{{ path }}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Persistent queue dropped rate ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the persistent queue size of pending samples in bytes which hasn't been flushed to remote storage yet. \n\nIncreasing of value might be a sign of connectivity issues. 
In such cases, vmagent starts to flush pending data on disk with attempt to send it later once connection is restored.\n\nRemote write URLs are hidden by default but might be unveiled once `-remoteWrite.showURL` is set to true.", - "fieldConfig": { - "defaults": { - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 24 - }, - "hiddenSeries": false, - "id": 17, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "show": true, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ - { - "title": "Troubleshooting", - "url": "https://docs.victoriametrics.com/vmagent.html#troubleshooting" - } - ], - "nullPointMode": "null", - "options": { - "alertThreshold": true - }, - "percentage": false, - "pluginVersion": "8.0.0", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vmagent_remotewrite_pending_data_bytes{job=~\"$job\", instance=~\"$instance\"}) by (url)", - "interval": "", - "legendFormat": "{{url}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Persistent queue size ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "aliasColors": {}, "bars": false, @@ -1256,7 +1260,7 @@ "alertThreshold": true }, "percentage": 
false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1266,7 +1270,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vmagent_remotewrite_global_relabel_metrics_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))", + "exemplar": true, + "expr": "sum(rate(vmagent_remotewrite_global_relabel_metrics_dropped_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval]))", "interval": "", "legendFormat": "global", "refId": "A" @@ -1282,7 +1287,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Rows dropped by relabeling ($instance)", + "title": "Rows dropped by relabeling ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -1362,7 +1367,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1373,9 +1378,9 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\"}[$__interval]))", + "expr": "sum(rate(vmagent_remotewrite_packets_dropped_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(url)", "interval": "", - "legendFormat": "dropped", + "legendFormat": "{{url}}", "refId": "A" } ], @@ -1383,7 +1388,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Data blocks dropped ($instance)", + "title": "Data blocks dropped ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -1421,17 +1426,118 @@ } }, { - "collapsed": true, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, "datasource": "$ds", + "description": "Shows the rate of logging the messages by their level. 
Unexpected spike in rate is a good reason to check logs.", "fieldConfig": { - "defaults": {}, + "defaults": { + "links": [] + }, "overrides": [] }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 86, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=~\"$instance\"}[5m])) by (level) ", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logging rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": true, + "datasource": "$ds", "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 40 + "y": 48 }, "id": 28, "panels": [ @@ -1453,7 +1559,7 @@ "h": 7, "w": 12, "x": 0, - "y": 2 + "y": 41 }, "hiddenSeries": false, "id": 48, @@ -1476,7 +1582,7 @@ "alertThreshold": true }, 
"percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1553,7 +1659,7 @@ "h": 7, "w": 12, "x": 12, - "y": 2 + "y": 41 }, "hiddenSeries": false, "id": 76, @@ -1576,7 +1682,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1653,7 +1759,7 @@ "h": 8, "w": 12, "x": 0, - "y": 9 + "y": 48 }, "hiddenSeries": false, "id": 20, @@ -1676,7 +1782,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1762,7 +1868,7 @@ "h": 8, "w": 12, "x": 12, - "y": 9 + "y": 48 }, "hiddenSeries": false, "id": 31, @@ -1785,7 +1891,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1878,7 +1984,7 @@ "h": 8, "w": 12, "x": 0, - "y": 17 + "y": 56 }, "hiddenSeries": false, "id": 46, @@ -1901,7 +2007,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -1984,7 +2090,7 @@ "h": 8, "w": 12, "x": 12, - "y": 17 + "y": 56 }, "heatmap": {}, "hideZeroBuckets": false, @@ -2037,15 +2143,11 @@ { "collapsed": true, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 41 + "y": 49 }, "id": 71, "panels": [ @@ -2068,7 +2170,7 @@ "h": 8, "w": 12, "x": 0, - "y": 3 + "y": 42 }, "hiddenSeries": false, "id": 73, @@ -2092,7 +2194,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2176,7 +2278,7 @@ "h": 8, "w": 12, "x": 12, - "y": 3 + "y": 42 }, "hiddenSeries": false, "id": 77, @@ -2200,7 +2302,7 @@ 
"alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2284,7 +2386,7 @@ "h": 8, "w": 12, "x": 0, - "y": 11 + "y": 50 }, "hiddenSeries": false, "id": 78, @@ -2308,7 +2410,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2385,7 +2487,7 @@ "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 50 }, "hiddenSeries": false, "id": 50, @@ -2408,7 +2510,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2473,15 +2575,11 @@ { "collapsed": true, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 42 + "y": 50 }, "id": 58, "panels": [ @@ -2527,7 +2625,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2537,7 +2635,8 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(vmagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url, status_code)", + "exemplar": true, + "expr": "sum(rate(vmagent_remotewrite_requests_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(url, status_code)", "interval": "", "legendFormat": "", "refId": "A" @@ -2547,7 +2646,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Requests rate ($instance)", + "title": "Requests rate ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -2627,7 +2726,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2637,6 +2736,7 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": 
"sum(rate(vmagent_remotewrite_conn_bytes_written_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(instance)", "interval": "", "legendFormat": "", @@ -2726,7 +2826,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2737,7 +2837,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(url)", + "expr": "sum(rate(vmagent_remotewrite_retries_count_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(url)", "interval": "", "legendFormat": "", "refId": "A" @@ -2747,7 +2847,7 @@ "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Retry rate ($instance)", + "title": "Retry rate ($instance) to ($url)", "tooltip": { "shared": true, "sort": 0, @@ -2826,7 +2926,7 @@ "alertThreshold": true }, "percentage": false, - "pluginVersion": "8.0.0", + "pluginVersion": "8.1.2", "pointradius": 2, "points": false, "renderer": "flot", @@ -2836,6 +2936,7 @@ "steppedLine": false, "targets": [ { + "exemplar": true, "expr": "sum(vmagent_remotewrite_conns{job=~\"$job\", instance=~\"$instance\"}) by (instance)", "interval": "", "legendFormat": "", @@ -2897,7 +2998,7 @@ }, "dataFormat": "tsbuckets", "datasource": "$ds", - "description": "Shows the remote write request block size distribution in rows.", + "description": "Shows the remote write request duration distribution in seconds. 
Value depends on block size, network quality and remote storage performance.", "gridPos": { "h": 8, "w": 12, @@ -2907,6 +3008,184 @@ "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, + "id": 30, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "buckets_limit(12, prometheus_buckets(sum(rate(vmagent_remotewrite_duration_seconds_bucket{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__interval])) by(vmrange)))", + "format": "heatmap", + "interval": "", + "intervalFactor": 10, + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Push duration ($instance) to ($url)", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 2, + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows saturation of every connection to remote storage. If the threshold of 0.9sec is reached, then the connection is saturated by more than 90% and vmagent won't be able to keep up. 
This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage.\n", + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "hiddenSeries": false, + "id": 84, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "8.1.2", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(vmagent_remotewrite_send_duration_seconds_total{job=~\"$job\", instance=~\"$instance\", url=~\"$url\"}[$__rate_interval])) by (instance, url)", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [ + { + "$$hashKey": "object:683", + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 0.9, + "yaxis": "left" + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Remote write connection saturation ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:662", + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:663", + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cards": { + 
"cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "$ds", + "description": "Shows the remote write request block size distribution in rows.", + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 28 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, "id": 63, "legend": { "show": false @@ -2914,6 +3193,7 @@ "reverseYBuckets": false, "targets": [ { + "exemplar": true, "expr": "buckets_limit(12, prometheus_buckets(sum(rate(vmagent_remotewrite_block_size_rows_bucket{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(vmrange)))", "format": "heatmap", "interval": "", @@ -2967,7 +3247,7 @@ "h": 8, "w": 12, "x": 12, - "y": 20 + "y": 28 }, "heatmap": {}, "hideZeroBuckets": false, @@ -3012,71 +3292,6 @@ "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null - }, - { - "cards": { - "cardPadding": null, - "cardRound": null - }, - "color": { - "cardColor": "#b4ff00", - "colorScale": "sqrt", - "colorScheme": "interpolateOranges", - "exponent": 0.5, - "mode": "spectrum" - }, - "dataFormat": "tsbuckets", - "datasource": "$ds", - "description": "Shows the remote write request duration distribution in seconds. 
Value depends on block size, network quality and remote storage performance.", - "gridPos": { - "h": 8, - "w": 24, - "x": 0, - "y": 28 - }, - "heatmap": {}, - "hideZeroBuckets": false, - "highlightCards": true, - "id": 30, - "legend": { - "show": false - }, - "reverseYBuckets": false, - "targets": [ - { - "expr": "buckets_limit(12, prometheus_buckets(sum(rate(vmagent_remotewrite_duration_seconds_bucket{job=~\"$job\", instance=~\"$instance\"}[$__interval])) by(vmrange)))", - "format": "heatmap", - "interval": "", - "intervalFactor": 10, - "legendFormat": "{{le}}", - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Push duration ($instance)", - "tooltip": { - "show": true, - "showHistogram": false - }, - "type": "heatmap", - "xAxis": { - "show": true - }, - "xBucketNumber": null, - "xBucketSize": null, - "yAxis": { - "decimals": 2, - "format": "s", - "logBase": 1, - "max": null, - "min": null, - "show": true, - "splitFactor": null - }, - "yBucketBound": "auto", - "yBucketNumber": null, - "yBucketSize": null } ], "title": "Remote write", @@ -3085,15 +3300,11 @@ { "collapsed": true, "datasource": "$ds", - "fieldConfig": { - "defaults": {}, - "overrides": [] - }, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 43 + "y": 51 }, "id": 45, "panels": [ @@ -3942,11 +4153,34 @@ "tagsQuery": "", "type": "query", "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "$ds", + "definition": "label_values(vmagent_remotewrite_requests_total{job=\"$job\", instance=~\"$instance\"}, url)", + "description": "The remote write URLs", + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "url", + "options": [], + "query": { + "query": "label_values(vmagent_remotewrite_requests_total{job=\"$job\", instance=~\"$instance\"}, url)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" } ] }, "time": { - "from": "now-1h", + "from": 
"now-30m", "to": "now" }, "timepicker": { diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 7632c4fa1b..e8c5025b51 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -276,3 +276,16 @@ groups: summary: "Job \"{{ $labels.job }}\" on instance {{ $labels.instance }} fails to push to remote storage" description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n Ensure that destination is up and reachable." + + - alert: RemoteWriteConnectionIsSaturated + expr: rate(vmagent_remotewrite_send_duration_seconds_total[5m]) > 0.9 + for: 15m + labels: + severity: warning + annotations: + dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=84&var-instance={{ $labels.instance }}" + summary: "Remote write connection from \"{{ $labels.job }}\" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated" + description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" + is saturated by more than 90% and vmagent won't be able to keep up.\n + This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase + the number of connections per each remote storage." diff --git a/deployment/docker/docker-compose.yml b/deployment/docker/docker-compose.yml index dbf2746d81..b6be8913ef 100644 --- a/deployment/docker/docker-compose.yml +++ b/deployment/docker/docker-compose.yml @@ -17,7 +17,7 @@ services: grafana: container_name: grafana - image: grafana/grafana:8.0.0 + image: grafana/grafana:8.1.2 depends_on: - "vmselect" ports: