Address first batch of old review comments

Signed-off-by: beorn7 <beorn@grafana.com>
beorn7 2019-07-12 22:58:43 +02:00
parent b3b47f2d07
commit 2180c2f3bf
3 changed files with 105 additions and 65 deletions

View File

@@ -21,22 +21,23 @@ local gauge = promgrafonnet.gauge;
)
.addTarget(prometheus.target(
|||
-1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
+1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
||| % $._config,
legendFormat='{{cpu}}',
intervalFactor=10,
));
+// TODO: Is this panel useful?
local systemLoad =
graphPanel.new(
-'System load',
+'Load Average',
datasource='$datasource',
span=6,
-format='percentunit',
+format='short',
)
-.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m'))
-.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m'))
-.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m'));
+.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
+.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
+.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
local memoryGraph =
graphPanel.new(
@@ -48,27 +49,27 @@ local gauge = promgrafonnet.gauge;
.addTarget(prometheus.target(
|||
(
-node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
-
-node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
+node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
-
-node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
+node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
-
-node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
+node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
)
||| % $._config, legendFormat='memory used'
))
-.addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
-.addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
-.addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
+.addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
+.addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
+.addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
local memoryGauge = gauge.new(
'Memory Usage',
|||
(
-node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"}
+node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
/
-node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
+node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
)
* 100
||| % $._config,
@@ -80,9 +81,9 @@ local gauge = promgrafonnet.gauge;
datasource='$datasource',
span=9,
)
-.addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
-.addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
-.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
+.addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
+.addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
+.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
{
seriesOverrides: [
{
@@ -96,18 +97,19 @@ local gauge = promgrafonnet.gauge;
],
yaxes: [
self.yaxe(format='bytes'),
-self.yaxe(format='ms'),
+self.yaxe(format='s'),
],
};
+// TODO: Should this be partitioned by mountpoint?
local diskSpaceUsage = gauge.new(
'Disk Space Usage',
|||
100 -
(
-sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}
+sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}
/
-sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}
+sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}
* 100
)
||| % $._config,
@@ -120,7 +122,7 @@ local gauge = promgrafonnet.gauge;
span=6,
format='bytes',
)
-.addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
+.addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
local networkTransmitted =
graphPanel.new(
@@ -129,7 +131,7 @@ local gauge = promgrafonnet.gauge;
span=6,
format='bytes',
)
-.addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
+.addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
dashboard.new('Nodes', time_from='now-1h')
.addTemplate(
@@ -152,7 +154,7 @@ local gauge = promgrafonnet.gauge;
template.new(
'instance',
'$datasource',
-'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
+'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config,
refresh='time',
)
)

View File

@@ -10,16 +10,30 @@ local g = import 'grafana-builder/grafana.libsonnet';
g.row('CPU')
.addPanel(
g.panel('CPU Utilisation') +
-g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
+g.queryPanel(|||
+(
+instance:node_cpu_utilisation:avg1m
+*
+instance:node_num_cpu:sum
+/ ignoring (instance) group_left
+sum without (instance) (instance:node_num_cpu:sum)
+)
+|||, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
-g.panel('CPU Saturation (Load1)') +
+// TODO: Is this a useful panel?
+g.panel('CPU Saturation (load1 per CPU)') +
g.queryPanel(|||
-instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
-||| % $._config, '{{instance}}', legendLink) +
+(
+instance:node_load1_per_cpu:ratio
+/ ignoring (instance) group_left
+count without (instance) (instance:node_load1_per_cpu:ratio)
+)
+|||, '{{instance}}', legendLink) +
g.stack +
+// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
)
@@ -43,16 +57,26 @@ local g = import 'grafana-builder/grafana.libsonnet';
.addPanel(
g.panel('Disk IO Utilisation') +
// Full utilisation would be all disks on each node spending an average of
-// 1 sec per second doing I/O, normalize by node count for stacked charts
-g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) +
+// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
+g.queryPanel(|||
+(
+instance:node_disk_utilisation:sum_irate
+/ ignoring (instance) group_left
+count without (instance) (instance:node_disk_utilisation:sum_irate)
+)
+|||, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
.addPanel(
g.panel('Disk IO Saturation') +
g.queryPanel(|||
-instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
-||| % $._config, '{{instance}}', legendLink) +
+(
+instance:node_disk_saturation:sum_irate
+/ ignoring (instance) group_left
+count without (instance) (instance:node_disk_saturation:sum_irate)
+)
+|||, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
)
@@ -76,7 +100,21 @@ local g = import 'grafana-builder/grafana.libsonnet';
g.row('Storage')
.addPanel(
g.panel('Disk Capacity') +
-g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
+g.queryPanel(|||
+(
+sum without (device) (
+max without (fstype, mountpoint) (
+node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"}
+)
+)
+/ ignoring (instance) group_left
+sum without (instance, device) (
+max without (fstype, mountpoint) (
+node_filesystem_size_bytes{fstype=~"ext[24]"}
+)
+)
+)
+|||, '{{instance}}', legendLink) +
g.stack +
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
),
@@ -106,9 +144,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
{ yaxes: g.yaxes('percentunit') },
)
.addPanel(
-g.panel('Memory Saturation (Swap I/O)') +
-g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
-{ yaxes: g.yaxes('Bps') },
+g.panel('Memory Saturation (pages swapped per second)') +
+g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') +
+{ yaxes: g.yaxes('short') },
)
)
.addRow(
@@ -141,7 +179,14 @@ local g = import 'grafana-builder/grafana.libsonnet';
g.row('Disk')
.addPanel(
g.panel('Disk Utilisation') +
-g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') +
+g.queryPanel(|||
+1 -
+(
+sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"}))
+/
+sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"}))
+)
+|||, 'Disk') +
{ yaxes: g.yaxes('percentunit') },
),
),

View File

@@ -8,8 +8,8 @@
// This rule gives the number of CPUs per node.
record: 'instance:node_num_cpu:sum',
expr: |||
-count by (instance) (
-sum by (instance, cpu) (
+count without (cpu) (
+sum without (mode) (
node_cpu_seconds_total{%(nodeExporterSelector)s}
)
)
@@ -19,29 +19,20 @@
// CPU utilisation is % CPU is not idle.
record: 'instance:node_cpu_utilisation:avg1m',
expr: |||
-1 - avg by (instance) (
+1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
)
||| % $._config,
},
{
-// CPU saturation is 1min avg run queue length / number of CPUs.
-// Can go over 100%. >100% is bad.
-record: 'instance:node_cpu_saturation_load1:',
+// This is CPU saturation: 1min avg run queue length / number of CPUs.
+// Can go over 1. >1 is bad.
+record: 'instance:node_load1_per_cpu:ratio',
expr: |||
(
-sum by (instance) (node_load1{%(nodeExporterSelector)s})
+node_load1{%(nodeExporterSelector)s}
/
-instance:node_num_cpu:sum
-)
-||| % $._config,
-},
-{
-// Total memory per node
-record: 'instance:node_memory_bytes_total:sum',
-expr: |||
-sum by (instance) (
-node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
+instance:node_num_cpu:sum{%(nodeExporterSelector)s}
)
||| % $._config,
},
@@ -57,9 +48,9 @@
||| % $._config,
},
{
-record: 'instance:node_memory_swap_io_bytes:sum_rate',
+record: 'instance:node_memory_swap_io_pages:sum_rate',
expr: |||
-1e3 * sum by (instance) (
+(
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
+
rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])
@@ -70,7 +61,7 @@
// Disk utilisation (ms spent, 1 second irate())
record: 'instance:node_disk_utilisation:sum_irate',
expr: |||
-sum by (instance) (
+sum without (device) (
irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
)
||| % $._config,
@@ -79,28 +70,30 @@
// Disk saturation (ms spent, by rate() it's bound by 1 second)
record: 'instance:node_disk_saturation:sum_irate',
expr: |||
-sum by (instance) (
+sum without (device) (
irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
)
||| % $._config,
},
+// TODO: For the following two rules, consider configurable filtering to exclude more network
+// device names than just "lo".
{
record: 'instance:node_net_utilisation:sum_irate',
expr: |||
-sum by (instance) (
-irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+sum without (device) (
+irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
+
-irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
)
||| % $._config,
},
{
record: 'instance:node_net_saturation:sum_irate',
expr: |||
-sum by (instance) (
-irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+sum without (device) (
+irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
+
-irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
+irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
)
||| % $._config,
},