From b457739f87c4bd30ccf2d51f69829b6fa2b2e336 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 15 Mar 2021 10:04:24 +0000 Subject: [PATCH] Single dashboard (#1126) * dashboard: update single node dashboard * add panel `Open FDs` for file descriptors metrics; * add panel `Disk writes/reads` to show the real read/write load on storage layer; * add `process_resident_memory_bytes` metric to memory usage panel; * add stats panel to show available CPUs, memory and disk space; * rm flags panel since it didn't prove its usefulness. * alerts: add alert for reaching FDs limit --- dashboards/victoriametrics.json | 3198 ++++++++++++++++++------------- deployment/docker/alerts.yml | 10 + 2 files changed, 1861 insertions(+), 1347 deletions(-) diff --git a/dashboards/victoriametrics.json b/dashboards/victoriametrics.json index 484e972ab..6e6aa5f02 100644 --- a/dashboards/victoriametrics.json +++ b/dashboards/victoriametrics.json @@ -25,12 +25,6 @@ "name": "Singlestat", "version": "" }, - { - "type": "panel", - "id": "table-old", - "name": "Table (old)", - "version": "" - }, { "type": "panel", "id": "text", @@ -51,12 +45,12 @@ } ] }, - "description": "Overview for single node VictoriaMetrics v1.48.0 or higher", + "description": "Overview for single node VictoriaMetrics v1.55.1 or higher", "editable": true, "gnetId": 10229, "graphTooltip": 0, "id": null, - "iteration": 1613320710757, + "iteration": 1615713966732, "links": [ { "icon": "doc", @@ -64,7 +58,7 @@ "targetBlank": true, "title": "Single server Wiki", "type": "link", - "url": "https://github.com/VictoriaMetrics/VictoriaMetrics/wiki/Single-server-VictoriaMetrics" + "url": "https://victoriametrics.github.io/" }, { "icon": "external link", @@ -110,8 +104,8 @@ "overrides": [] }, "gridPos": { - "h": 3, - "w": 8, + "h": 2, + "w": 6, "x": 0, "y": 1 }, @@ -134,98 +128,6 @@ "title": "Version", "type": "text" }, - { - "columns": [], - "datasource": "$ds", - "description": "Run VM with `-help` flag to see all the available flags with description and default values", - "fieldConfig": { - "defaults": { - "custom": {} - }, - "overrides": [] - }, - "fontSize": "100%", - "gridPos": { - "h": 9, - "w": 16, - "x": 8, - "y": 1 - }, - "id": 4, - "links": [], - "pageSize": null, - "scroll": true, - "showHeader": true, - "sort": { - "col": 4, - "desc": true - }, - "styles": [ - { - "alias": "", - "align": "auto", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 2, - "pattern": "name", - "thresholds": [], - "type": "string", - "unit": "short" - }, - { - "alias": "", - "align": "auto", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "mappingType": 1, - "pattern": "value", - "thresholds": [], - "type": "string", - "unit": "short" - }, - { - "alias": "", - "align": "auto", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "mappingType": 1, - "pattern": "/.*/", - "thresholds": [], - "type": "hidden", - "unit": "short" - } - ], - "targets": [ - { - "expr": "flag{job=\"$job\", instance=~\"$instance\", value!=\"secret\", value!=\"\"}", - "format": "table", - "instant": true, - "intervalFactor": 1, - "refId": "A" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Flags", - "transform": "table", - "type": "table-old" - }, { "cacheTimeout": null, "colorBackground": false, @@ -253,9 +155,9 @@ }, "gridPos": { "h": 2, - "w": 8, - "x": 0, - "y": 4 + "w": 6, + "x": 6, + "y": 1 }, "id": 26, "interval": null, @@ -316,6 +218,278 @@ ], "valueName": "current" }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$ds", + "description": "The size of the free disk space left", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 12, + "y": 1 + }, + "id": 80, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(vm_free_disk_space_bytes{job=\"$job\", instance=~\"$instance\", path=\"/storage\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Free disk space", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$ds", + "description": "Total size of available memory for VM process", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 1 + }, + "id": 78, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(vm_available_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Available memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$ds", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 0, + "y": 3 + }, + "id": 8, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}", + "targets": [ + { + "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, { "cacheTimeout": null, "colorBackground": false, @@ -343,9 +517,9 @@ }, "gridPos": { "h": 2, - "w": 8, - "x": 0, - "y": 6 + "w": 6, + "x": 6, + "y": 3 }, "id": 38, "interval": null, @@ -416,13 +590,14 @@ "#d44a3a" ], "datasource": "$ds", + "description": "Total number of available CPUs for VM process", "fieldConfig": { "defaults": { "custom": {} }, "overrides": [] }, - "format": "s", + "format": "short", "gauge": { "maxValue": 100, "minValue": 0, @@ -432,11 +607,11 @@ }, "gridPos": { "h": 2, - "w": 8, - "x": 0, - "y": 8 + "w": 6, + "x": 12, + "y": 3 }, - "id": 8, + "id": 77, "interval": null, "links": [], "mappingType": 1, @@ -466,23 +641,118 @@ ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, + "full": true, "lineColor": "rgb(31, 120, 193)", - "show": false + "show": true }, - "tableColumn": "vm_app_uptime_seconds{instance=\"victoriametrics:8428\", job=\"victoriametrics\"}", + "tableColumn": "", "targets": [ { - "expr": "vm_app_uptime_seconds{job=\"$job\", instance=\"$instance\"}", + "expr": "sum(vm_available_cpu_cores{job=\"$job\", instance=~\"$instance\"})", "format": "time_series", + "instant": false, + "interval": "", "intervalFactor": 1, + "legendFormat": "", "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, - "title": "Uptime", + "title": "Available CPU", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$ds", + "description": "Total size of allowed memory via flag `-memory.allowedPercent`", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "format": "bytes", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 6, + "x": 18, + "y": 3 + }, + "id": 79, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(vm_allowed_memory_bytes{job=\"$job\", instance=~\"$instance\"})", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "timeFrom": null, + "timeShift": null, + "title": "Allowed memory", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -501,7 +771,7 @@ "h": 1, "w": 24, "x": 0, - "y": 10 + "y": 5 }, "id": 24, "panels": [], @@ -528,7 +798,7 @@ "h": 8, "w": 12, "x": 0, - "y": 11 + "y": 6 }, "hiddenSeries": false, "id": 12, @@ -628,7 +898,7 @@ "h": 8, "w": 12, "x": 12, - "y": 11 + "y": 6 }, "hiddenSeries": false, "id": 22, @@ -727,7 +997,7 @@ "h": 8, "w": 12, "x": 0, - "y": 19 + "y": 14 }, "hiddenSeries": false, "id": 51, @@ -829,7 +1099,7 @@ "h": 8, "w": 12, "x": 12, - "y": 19 + "y": 14 }, "hiddenSeries": false, "id": 33, @@ -939,7 +1209,7 @@ "h": 8, "w": 12, "x": 0, - "y": 27 + "y": 22 }, "hiddenSeries": false, "id": 59, @@ -1055,7 +1325,7 @@ "h": 8, "w": 12, "x": 12, - "y": 27 + "y": 22 }, "hiddenSeries": false, "id": 35, @@ -1135,1124 +1405,1238 @@ } }, { - "collapsed": false, + "collapsed": true, "datasource": "$ds", "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 35 + "y": 30 }, "id": 14, - "panels": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "How many datapoints are inserted into storage per second", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 40 + }, + "hiddenSeries": false, + "id": 10, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": false, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Datapoints ingestion rate ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 40 + }, + "hiddenSeries": false, + "id": 73, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideZero": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Storage full ETA ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 48 + }, + "hiddenSeries": false, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "bytes-per-datapoint", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "total datapoints", + "refId": "A" + }, + { + "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "bytes-per-datapoint", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "decimals": 2, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 48 + }, + "hiddenSeries": false, + "id": 34, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "pending index entries", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "pending datapoints", + "refId": "A" + }, + { + "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "pending index entries", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Pending datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "decimals": 3, + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 56 + }, + "hiddenSeries": false, + "id": 53, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Used", + "refId": "A" + }, + { + "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}", + "format": "time_series", + "interval": "", + "intervalFactor": 1, + "legendFormat": "Free", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage - datapoints ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 56 + }, + "hiddenSeries": false, + "id": 36, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "LSM parts ($instance)", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows amount of on-disk space occupied by inverted index.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 64 + }, + "hiddenSeries": false, + "id": 55, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk space usage - index ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 64 + }, + "hiddenSeries": false, + "id": 62, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)", + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Active merges ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the number of bytes read/write from the storage layer.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 72 + }, + "hiddenSeries": false, + "id": 76, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "transform": "negative-Y" + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(process_io_storage_read_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "sum(rate(process_io_storage_written_bytes_total{job=\"$job\", instance=\"$instance\"}[5m]))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "write", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Disk writes/reads ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "The number of rows merged per second by storage nodes.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 72 + }, + "hiddenSeries": false, + "id": 64, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)", + "legendFormat": "{{type}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Merge speed ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 80 + }, + "hiddenSeries": false, + "id": 58, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{reason}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Rows ignored ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 80 + }, + "hiddenSeries": false, + "id": 67, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "{{level}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Logging rate ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], "title": "Storage", "type": "row" }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "How many datapoints are inserted into storage per second", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 36 - }, - "hiddenSeries": false, - "id": 10, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideZero": true, - "max": false, - "min": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_rows_inserted_total{job=\"$job\", instance=\"$instance\"}[$__interval])) by (type) > 0", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Datapoints ingestion rate ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the time needed to reach the 100% of disk capacity based on the following params:\n* free disk space;\n* row ingestion rate;\n* dedup rate;\n* compression.\n\nUse this panel for capacity planning in order to estimate the time remaining for running out of the disk space.\n\n", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 36 - }, - "hiddenSeries": false, - "id": 73, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideZero": true, - "max": false, - "min": false, - "show": false, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"} / ignoring(path) ((rate(vm_rows_added_to_storage_total{job=\"$job\", instance=\"$instance\"}[1d]) - ignoring(type) rate(vm_deduplicated_samples_total{job=\"$job\", instance=\"$instance\", type=\"merge\"}[1d])) * scalar(sum(vm_data_size_bytes{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=\"$instance\", type!=\"indexdb\"})))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Storage full ETA ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows how many datapoints are in the storage and what is average disk usage per datapoint.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 44 - }, - "hiddenSeries": false, - "id": 30, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "bytes-per-datapoint", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "total datapoints", - "refId": "A" - }, - { - "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"}) / sum(vm_rows{job=\"$job\", instance=~\"$instance\", type != \"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "bytes-per-datapoint", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": 2, - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "How many datapoints are in RAM queue waiting to be written into storage. The number of pending data points should be in the range from 0 to `2*`, since VictoriaMetrics pushes pending data to persistent storage every second.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 44 - }, - "hiddenSeries": false, - "id": 34, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "pending index entries", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"storage\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "pending datapoints", - "refId": "A" - }, - { - "expr": "vm_pending_rows{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "pending index entries", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Pending datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "decimals": 3, - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows amount of on-disk space occupied by data points and the remaining disk space at `-storageDataPath`", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 52 - }, - "hiddenSeries": false, - "id": 53, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type!=\"indexdb\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Used", - "refId": "A" - }, - { - "expr": "vm_free_disk_space_bytes{job=\"$job\", instance=\"$instance\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 1, - "legendFormat": "Free", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk space usage - datapoints ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Data parts of LSM tree.\nHigh number of parts could be an evidence of slow merge performance - check the resource utilization.\n* `indexdb` - inverted index\n* `storage/small` - recently added parts of data ingested into storage(hot data)\n* `storage/big` - small parts gradually merged into big parts (cold data)", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 52 - }, - "hiddenSeries": false, - "id": 36, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_parts{job=\"$job\", instance=\"$instance\"}) by (type)", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "LSM parts ($instance)", - "tooltip": { - "shared": true, - "sort": 2, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "The number of on-going merges in storage nodes. It is expected to have high numbers for `storage/small` metric.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 60 - }, - "hiddenSeries": false, - "id": 62, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_active_merges{job=\"$job\", instance=\"$instance\"}) by(type)", - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Active merges ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows amount of on-disk space occupied by inverted index.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 60 - }, - "hiddenSeries": false, - "id": 55, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "vm_data_size_bytes{job=\"$job\", instance=~\"$instance\", type=\"indexdb\"}", - "format": "time_series", - "intervalFactor": 1, - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Disk space usage - index ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows how many rows were ignored on insertion due to corrupted or out of retention timestamps.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 68 - }, - "hiddenSeries": false, - "id": 58, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_rows_ignored_total{job=\"$job\", instance=\"$instance\"}) by (reason) > 0", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{reason}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Rows ignored ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "The number of rows merged per second by storage nodes.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 68 - }, - "hiddenSeries": false, - "id": 64, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_rows_merged_total{job=\"$job\", instance=\"$instance\"}[5m])) by(type)", - "legendFormat": "{{type}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Merge speed ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": 0, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "Shows the rate of logging the messages by their level. Unexpected spike in rate is a good reason to check logs.", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 12, - "y": 76 - }, - "hiddenSeries": false, - "id": 67, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(vm_log_messages_total{job=\"$job\", instance=\"$instance\"}[5m])) by (level) ", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "{{level}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Logging rate ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, { "collapsed": true, "datasource": "$ds", @@ -2260,7 +2644,7 @@ "h": 1, "w": 24, "x": 0, - "y": 84 + "y": 31 }, "id": 71, "panels": [ @@ -2664,7 +3048,7 @@ "h": 1, "w": 24, "x": 0, - "y": 85 + "y": 32 }, "id": 46, "panels": [ @@ -2688,7 +3072,7 @@ "h": 8, "w": 12, "x": 0, - "y": 102 + "y": 103 }, "hiddenSeries": false, "id": 44, @@ -2738,6 +3122,15 @@ "intervalFactor": 1, "legendFormat": "stack inuse", "refId": "C" + }, + { + "expr": "sum(process_resident_memory_bytes{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "resident", + "refId": "D" } ], "thresholds": [], @@ -2747,7 +3140,7 @@ "title": "Memory usage ($instance)", "tooltip": { "shared": true, - "sort": 0, + "sort": 2, "value_type": "individual" }, "type": "graph", @@ -2800,7 +3193,7 @@ "h": 8, "w": 12, "x": 12, - "y": 102 + "y": 103 }, "hiddenSeries": false, "id": 57, @@ -2883,6 +3276,7 @@ "dashLength": 10, "dashes": false, "datasource": "$ds", + "description": "Panel shows the number of open file descriptors in the OS.\nReaching the limit of open files can cause various issues and must be prevented.\n\nSee how to change limits here https://medium.com/@muhammadtriwibowo/set-permanently-ulimit-n-open-files-in-ubuntu-4d61064429a", "fieldConfig": { "defaults": { "custom": {}, @@ -2896,10 +3290,10 @@ "h": 8, "w": 12, "x": 0, - "y": 110 + "y": 111 }, "hiddenSeries": false, - "id": 47, + "id": 75, "legend": { "avg": false, "current": false, @@ -2918,24 +3312,38 @@ "pointradius": 2, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "max", + "color": "#C4162A" + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(go_goroutines{job=\"$job\", instance=\"$instance\"})", + "expr": "sum(process_open_fds{job=\"$job\", instance=\"$instance\"})", "format": "time_series", + "interval": "", "intervalFactor": 2, - "legendFormat": "gc duration", + "legendFormat": "open", "refId": "A" + }, + { + "expr": "min(process_max_fds{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "max", + "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, - "title": "Goroutines ($instance)", + "title": "Open FDs ($instance)", "tooltip": { "shared": true, "sort": 0, @@ -2954,7 +3362,7 @@ "decimals": 0, "format": "short", "label": null, - "logBase": 1, + "logBase": 2, "max": null, "min": "0", "show": true @@ -2992,7 +3400,7 @@ "h": 8, "w": 12, "x": 12, - "y": 110 + "y": 111 }, "hiddenSeries": false, "id": 42, @@ -3087,7 +3495,201 @@ "h": 8, "w": 12, "x": 0, - "y": 118 + "y": 119 + }, + "hiddenSeries": false, + "id": 47, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(go_goroutines{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "gc duration", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Goroutines ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": 0, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 119 + }, + "hiddenSeries": false, + "id": 37, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(vm_tcplistener_conns{job=\"$job\", instance=\"$instance\"})", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "connections", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "TCP connections ($instance)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$ds", + "fieldConfig": { + "defaults": { + "custom": {}, + "links": [] + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 127 }, "hiddenSeries": false, "id": 48, @@ -3184,105 +3786,7 @@ "h": 8, "w": 12, "x": 12, - "y": 118 - }, - "hiddenSeries": false, - "id": 37, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pluginVersion": "7.1.1", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(vm_tcplistener_conns{job=\"$job\", instance=\"$instance\"})", - "format": "time_series", - "hide": false, - "intervalFactor": 1, - "legendFormat": "connections", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "TCP connections ($instance)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$ds", - "description": "", - "fieldConfig": { - "defaults": { - "custom": {}, - "links": [] - }, - "overrides": [] - }, - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 8, - "w": 12, - "x": 0, - "y": 126 + "y": 127 }, "hiddenSeries": false, "id": 49, diff --git a/deployment/docker/alerts.yml b/deployment/docker/alerts.yml index 77f72854b..a0a32cdbd 100644 --- a/deployment/docker/alerts.yml +++ b/deployment/docker/alerts.yml @@ -122,6 +122,16 @@ groups: description: "High rate of slow inserts on \"{{ $labels.instance }}\" may be a sign of resource exhaustion for the current load. It is likely more RAM is needed for optimal handling of the current number of active time series." + - alert: ProcessNearFDLimits + expr: process_open_fds / process_max_fds > 0.8 + for: 10m + labels: + severity: critical + annotations: + summary: "Number of free file descriptors is less than 20% for \"{{ $labels.job }}\"(\"{{ $labels.instance }}\") for the last 10m" + description: "Exhausting OS file descriptors limit can cause severe degradation of the process. + Consider to increase the limit as fast as possible." + # Alerts group for vmagent assumes that Grafana dashboard # https://grafana.com/grafana/dashboards/12683 is installed. # Pls update the `dashboard` annotation according to your setup.