diff --git a/docs/node-mixin/dashboards/node.libsonnet b/docs/node-mixin/dashboards/node.libsonnet
index 115d98cf..040d60a3 100644
--- a/docs/node-mixin/dashboards/node.libsonnet
+++ b/docs/node-mixin/dashboards/node.libsonnet
@@ -21,22 +21,23 @@ local gauge = promgrafonnet.gauge;
         )
         .addTarget(prometheus.target(
           |||
-            1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
+            1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
           ||| % $._config,
           legendFormat='{{cpu}}',
           intervalFactor=10,
         ));
 
+      // TODO: Is this panel useful?
       local systemLoad =
         graphPanel.new(
-          'System load',
+          'Load Average',
           datasource='$datasource',
           span=6,
-          format='percentunit',
+          format='short',
         )
-        .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m'))
-        .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m'))
-        .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m'));
+        .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
+        .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
+        .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
 
       local memoryGraph =
         graphPanel.new(
@@ -48,27 +49,27 @@ local gauge = promgrafonnet.gauge;
         .addTarget(prometheus.target(
           |||
             (
-              node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+              node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
             -
-              node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
+              node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
             -
-              node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
+              node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
             -
-              node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
+              node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
             )
           ||| % $._config,
           legendFormat='memory used'
         ))
-        .addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
-        .addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
-        .addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
+        .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
+        .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
+        .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
       local memoryGauge = gauge.new(
         'Memory Usage',
         |||
           (
-            node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"}
+            node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
           /
-            node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+            node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
           )
           * 100
         ||| % $._config,
@@ -80,9 +81,9 @@ local gauge = promgrafonnet.gauge;
           datasource='$datasource',
           span=9,
         )
-        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
-        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
-        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
+        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
+        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
+        .addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
         {
           seriesOverrides: [
             {
@@ -96,18 +97,19 @@ local gauge = promgrafonnet.gauge;
           ],
           yaxes: [
             self.yaxe(format='bytes'),
-            self.yaxe(format='ms'),
+            self.yaxe(format='s'),
           ],
         };
 
+      // TODO: Should this be partitioned by mountpoint?
       local diskSpaceUsage = gauge.new(
         'Disk Space Usage',
         |||
           100 -
           (
-            sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
+            sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
          /
-            sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"})
+            sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"})
           * 100
           )
         ||| % $._config,
@@ -120,7 +122,7 @@ local gauge = promgrafonnet.gauge;
           span=6,
           format='bytes',
         )
-        .addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
+        .addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
 
       local networkTransmitted =
         graphPanel.new(
@@ -129,7 +131,7 @@ local gauge = promgrafonnet.gauge;
           span=6,
           format='bytes',
         )
-        .addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
+        .addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
 
       dashboard.new('Nodes', time_from='now-1h')
       .addTemplate(
@@ -152,7 +154,7 @@ local gauge = promgrafonnet.gauge;
         template.new(
           'instance',
           '$datasource',
-          'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
+          'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config,
           refresh='time',
         )
       )
diff --git a/docs/node-mixin/dashboards/use.libsonnet b/docs/node-mixin/dashboards/use.libsonnet
index 9bba6043..96bf0f59 100644
--- a/docs/node-mixin/dashboards/use.libsonnet
+++ b/docs/node-mixin/dashboards/use.libsonnet
@@ -10,16 +10,30 @@ local g = import 'grafana-builder/grafana.libsonnet';
       g.row('CPU')
       .addPanel(
         g.panel('CPU Utilisation') +
-        g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
+        g.queryPanel(|||
+          (
+            instance:node_cpu_utilisation:avg1m
+          *
+            instance:node_num_cpu:sum
+          / ignoring (instance) group_left
+            sum without (instance) (instance:node_num_cpu:sum)
+          )
+        |||, '{{instance}}', legendLink) +
         g.stack +
         { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
       )
       .addPanel(
-        g.panel('CPU Saturation (Load1)') +
+        // TODO: Is this a useful panel?
+        g.panel('CPU Saturation (load1 per CPU)') +
         g.queryPanel(|||
-          instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
-        ||| % $._config, '{{instance}}', legendLink) +
+          (
+            instance:node_load1_per_cpu:ratio
+          / ignoring (instance) group_left
+            count without (instance) (instance:node_load1_per_cpu:ratio)
+          )
+        |||, '{{instance}}', legendLink) +
         g.stack +
+        // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
         { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
       )
     )
@@ -43,16 +57,26 @@ local g = import 'grafana-builder/grafana.libsonnet';
       .addPanel(
         g.panel('Disk IO Utilisation') +
         // Full utilisation would be all disks on each node spending an average of
-        // 1 sec per second doing I/O, normalize by node count for stacked charts
-        g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) +
+        // 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
+        g.queryPanel(|||
+          (
+            instance:node_disk_utilisation:sum_irate
+          / ignoring (instance) group_left
+            count without (instance) (instance:node_disk_utilisation:sum_irate)
+          )
+        |||, '{{instance}}', legendLink) +
         g.stack +
         { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
       )
       .addPanel(
         g.panel('Disk IO Saturation') +
         g.queryPanel(|||
-          instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
-        ||| % $._config, '{{instance}}', legendLink) +
+          (
+            instance:node_disk_saturation:sum_irate
+          / ignoring (instance) group_left
+            count without (instance) (instance:node_disk_saturation:sum_irate)
+          )
+        |||, '{{instance}}', legendLink) +
         g.stack +
         { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
       )
@@ -76,7 +100,21 @@ local g = import 'grafana-builder/grafana.libsonnet';
       g.row('Storage')
      .addPanel(
         g.panel('Disk Capacity') +
-        g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
+        g.queryPanel(|||
+          (
+            sum without (device) (
+              max without (fstype, mountpoint) (
+                node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"}
+              )
+            )
+          / ignoring (instance) group_left
+            sum without (instance, device) (
+              max without (fstype, mountpoint) (
+                node_filesystem_size_bytes{fstype=~"ext[24]"}
+              )
+            )
+          )
+        |||, '{{instance}}', legendLink) +
         g.stack +
         { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
       ),
@@ -106,9 +144,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
         { yaxes: g.yaxes('percentunit') },
       )
       .addPanel(
-        g.panel('Memory Saturation (Swap I/O)') +
-        g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
-        { yaxes: g.yaxes('Bps') },
+        g.panel('Memory Saturation (pages swapped per second)') +
+        g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') +
+        { yaxes: g.yaxes('short') },
       )
     )
     .addRow(
@@ -141,7 +179,14 @@ local g = import 'grafana-builder/grafana.libsonnet';
       g.row('Disk')
      .addPanel(
         g.panel('Disk Utilisation') +
-        g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') +
+        g.queryPanel(|||
+          1 -
+          (
+            sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"}))
+          /
+            sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"}))
+          )
+        |||, 'Disk') +
         { yaxes: g.yaxes('percentunit') },
       ),
     ),
diff --git a/docs/node-mixin/rules/rules.libsonnet b/docs/node-mixin/rules/rules.libsonnet
index 27636aa8..6bd39a5f 100644
--- a/docs/node-mixin/rules/rules.libsonnet
+++ b/docs/node-mixin/rules/rules.libsonnet
@@ -8,8 +8,8 @@
             // This rule gives the number of CPUs per node.
             record: 'instance:node_num_cpu:sum',
             expr: |||
-              count by (instance) (
-                sum by (instance, cpu) (
+              count without (cpu) (
+                sum without (mode) (
                   node_cpu_seconds_total{%(nodeExporterSelector)s}
                 )
               )
@@ -19,29 +19,20 @@
             // CPU utilisation is % CPU is not idle.
             record: 'instance:node_cpu_utilisation:avg1m',
             expr: |||
-              1 - avg by (instance) (
-                rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m])
+              1 - avg without (cpu, mode) (
+                rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
               )
             ||| % $._config,
           },
           {
-            // CPU saturation is 1min avg run queue length / number of CPUs.
-            // Can go over 100%. >100% is bad.
-            record: 'instance:node_cpu_saturation_load1:',
+            // This is CPU saturation: 1min avg run queue length / number of CPUs.
+            // Can go over 1. >1 is bad.
+            record: 'instance:node_load1_per_cpu:ratio',
             expr: |||
               (
-                sum by (instance) (node_load1{%(nodeExporterSelector)s})
+                node_load1{%(nodeExporterSelector)s}
               /
-                instance:node_num_cpu:sum
-              )
-            ||| % $._config,
-          },
-          {
-            // Total memory per node
-            record: 'instance:node_memory_bytes_total:sum',
-            expr: |||
-              sum by (instance) (
-                node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
+                instance:node_num_cpu:sum{%(nodeExporterSelector)s}
               )
             ||| % $._config,
           },
@@ -57,9 +48,9 @@
             ||| % $._config,
           },
           {
-            record: 'instance:node_memory_swap_io_bytes:sum_rate',
+            record: 'instance:node_memory_swap_io_pages:sum_rate',
             expr: |||
-              1e3 * sum by (instance) (
+              (
                 rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
               +
                 rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])
               )
             ||| % $._config,
@@ -70,7 +61,7 @@
             // Disk utilisation (ms spent, 1 second irate())
             record: 'instance:node_disk_utilisation:sum_irate',
             expr: |||
-              sum by (instance) (
+              sum without (device) (
                 irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
               )
             ||| % $._config,
@@ -79,28 +70,30 @@
             // Disk saturation (ms spent, by rate() it's bound by 1 second)
             record: 'instance:node_disk_saturation:sum_irate',
             expr: |||
-              sum by (instance) (
+              sum without (device) (
                 irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
               )
             ||| % $._config,
           },
+          // TODO: For the following two rules, consider configurable filtering to exclude more network
+          // device names than just "lo".
           {
             record: 'instance:node_net_utilisation:sum_irate',
             expr: |||
-              sum by (instance) (
-                irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+              sum without (device) (
+                irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
               +
-                irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+                irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
               )
             ||| % $._config,
           },
           {
             record: 'instance:node_net_saturation:sum_irate',
             expr: |||
-              sum by (instance) (
-                irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+              sum without (device) (
+                irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
               +
-                irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+                irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
               )
             ||| % $._config,
           },
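
Note on the recurring normalization pattern in the dashboard hunks above: instead of dividing each per-instance series by scalar(sum(up{...})), the panels now divide by the number of instances that actually expose the recording rule, joined with ignoring (instance) group_left. A minimal PromQL sketch of the pattern, reusing the instance:node_disk_utilisation:sum_irate rule name from this diff (any extra selectors are omitted here for illustration):

    (
      instance:node_disk_utilisation:sum_irate
    / ignoring (instance) group_left
      count without (instance) (instance:node_disk_utilisation:sum_irate)
    )

The divisor carries no instance label, so ignoring (instance) group_left lets the one count series match every per-instance series on the left; a stacked percentunit panel built from this then sums to at most 1 when every instance reports, without depending on how many targets happen to be scraped.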