From 135710d8b511ed6e194891cf53c2cbedc6c09abd Mon Sep 17 00:00:00 2001 From: Marcus Cobden <52370+leth@users.noreply.github.com> Date: Sat, 24 Sep 2022 07:25:15 +0100 Subject: [PATCH] Add btrfs device error stats (#2193) * Improve metrics filesystem scanning logic * Makes ioctl syscalls to load the device error stats. * Adds filesystem mountpoint labels to existing metrics for ease of use. Signed-off-by: Marcus Cobden --- collector/btrfs_linux.go | 259 +++++++++++++++++++++++++++++++--- collector/btrfs_linux_test.go | 14 +- go.mod | 1 + go.sum | 4 + 4 files changed, 251 insertions(+), 27 deletions(-) diff --git a/collector/btrfs_linux.go b/collector/btrfs_linux.go index 01971c7c..bb1820ff 100644 --- a/collector/btrfs_linux.go +++ b/collector/btrfs_linux.go @@ -18,8 +18,13 @@ package collector import ( "fmt" + "path" + "strings" + "syscall" + dennwc "github.com/dennwc/btrfs" "github.com/go-kit/log" + "github.com/go-kit/log/level" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/procfs/btrfs" ) @@ -52,19 +57,163 @@ func NewBtrfsCollector(logger log.Logger) (Collector, error) { func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error { stats, err := c.fs.Stats() if err != nil { - return fmt.Errorf("failed to retrieve Btrfs stats: %w", err) + return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err) + } + + ioctlStatsMap, err := c.getIoctlStats() + if err != nil { + level.Debug(c.logger).Log( + "msg", "Error querying btrfs device stats with ioctl", + "err", err) + ioctlStatsMap = make(map[string]*btrfsIoctlFsStats) } for _, s := range stats { - c.updateBtrfsStats(ch, s) + // match up procfs and ioctl info by filesystem UUID (without dashes) + var fsUUID = strings.Replace(s.UUID, "-", "", -1) + ioctlStats := ioctlStatsMap[fsUUID] + c.updateBtrfsStats(ch, s, ioctlStats) } return nil } +type btrfsIoctlFsDevStats struct { + path string + uuid string + + bytesUsed uint64 + totalBytes uint64 + + // The error stats below match the following upstream lists: + // https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140 + // https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692 + writeErrs uint64 + readErrs uint64 + flushErrs uint64 + corruptionErrs uint64 + generationErrs uint64 +} + +type btrfsIoctlFsStats struct { + uuid string + devices []btrfsIoctlFsDevStats +} + +func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) { + // Instead of introducing more ioctl calls to scan for all btrfs + // filesytems re-use our mount point utils to find known mounts + mountsList, err := mountPointDetails(c.logger) + if err != nil { + return nil, err + } + + // track devices we have successfully scanned, by device path + devicesDone := make(map[string]struct{}) + // filesystems scann results by UUID + fsStats := make(map[string]*btrfsIoctlFsStats) + + for _, mount := range mountsList { + if mount.fsType != "btrfs" { + continue + } + + if _, found := devicesDone[mount.device]; found { + // We already found this filesystem by another mount point + continue + } + + fs, err := dennwc.Open(mount.mountPoint, true) + if err != nil { + // failed to open this mount point, maybe we didn't have permission + // maybe we'll find another mount point for this FS later + level.Debug(c.logger).Log( + "msg", "Error inspecting btrfs mountpoint", + "mountPoint", mount.mountPoint, + "err", err) + continue + } + + fsInfo, err := fs.Info() + if err != nil { + // Failed to get the FS info for some reason, + // perhaps it'll work with a different mount point + level.Debug(c.logger).Log( + "msg", "Error querying btrfs filesystem", + "mountPoint", mount.mountPoint, + "err", err) + continue + } + + fsID := fsInfo.FSID.String() + if _, found := fsStats[fsID]; found { + // We already found this filesystem by another mount point + continue + } + + deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo) + if err != nil { + level.Debug(c.logger).Log( + "msg", "Error querying btrfs device stats", + "mountPoint", mount.mountPoint, + "err", err) + continue + } + + devicesDone[mount.device] = struct{}{} + fsStats[fsID] = &btrfsIoctlFsStats{ + uuid: fsID, + devices: deviceStats, + } + } + + return fsStats, nil +} + +func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) { + devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices) + + for i := uint64(0); i <= fsInfo.MaxID; i++ { + deviceInfo, err := fs.GetDevInfo(i) + + if err != nil { + if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV { + // device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this + continue + } + return nil, err + } + + deviceStats, err := fs.GetDevStats(i) + if err != nil { + return nil, err + } + + devices = append(devices, btrfsIoctlFsDevStats{ + path: deviceInfo.Path, + uuid: deviceInfo.UUID.String(), + bytesUsed: deviceInfo.BytesUsed, + totalBytes: deviceInfo.TotalBytes, + + writeErrs: deviceStats.WriteErrs, + readErrs: deviceStats.ReadErrs, + flushErrs: deviceStats.FlushErrs, + corruptionErrs: deviceStats.CorruptionErrs, + generationErrs: deviceStats.GenerationErrs, + }) + + if uint64(len(devices)) == fsInfo.NumDevices { + break + } + } + + return devices, nil +} + // btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric. type btrfsMetric struct { name string + metricType prometheus.ValueType desc string value float64 extraLabel []string @@ -72,14 +221,14 @@ type btrfsMetric struct { } // updateBtrfsStats collects statistics for one bcache ID. -func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats) { +func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) { const subsystem = "btrfs" // Basic information about the filesystem. devLabels := []string{"uuid"} // Retrieve the metrics. - metrics := c.getMetrics(s) + metrics := c.getMetrics(s, ioctlStats) // Convert all gathered metrics to Prometheus Metrics and add to channel. for _, m := range metrics { @@ -99,7 +248,7 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. ch <- prometheus.MustNewConstMetric( desc, - prometheus.GaugeValue, + m.metricType, m.value, labelValues..., ) @@ -107,38 +256,104 @@ func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs. } // getMetrics returns metrics for the given Btrfs statistics. -func (c *btrfsCollector) getMetrics(s *btrfs.Stats) []btrfsMetric { +func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric { metrics := []btrfsMetric{ { name: "info", desc: "Filesystem information", value: 1, + metricType: prometheus.GaugeValue, extraLabel: []string{"label"}, extraLabelValue: []string{s.Label}, }, { - name: "global_rsv_size_bytes", - desc: "Size of global reserve.", - value: float64(s.Allocation.GlobalRsvSize), + name: "global_rsv_size_bytes", + desc: "Size of global reserve.", + metricType: prometheus.GaugeValue, + value: float64(s.Allocation.GlobalRsvSize), }, } - // Information about devices. - for n, dev := range s.Devices { - metrics = append(metrics, btrfsMetric{ - name: "device_size_bytes", - desc: "Size of a device that is part of the filesystem.", - value: float64(dev.Size), - extraLabel: []string{"device"}, - extraLabelValue: []string{n}, - }) - } - // Information about data, metadata and system data. metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...) metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...) metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...) + // Information about devices. + if ioctlStats == nil { + for n, dev := range s.Devices { + metrics = append(metrics, btrfsMetric{ + name: "device_size_bytes", + desc: "Size of a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, + value: float64(dev.Size), + extraLabel: []string{"device"}, + extraLabelValue: []string{n}, + }) + } + return metrics + } + + for _, dev := range ioctlStats.devices { + // trim the path prefix from the device name so the value should match + // the value used in the fallback branch above. + // e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1 + _, device := path.Split(dev.path) + + extraLabels := []string{"device", "btrfs_dev_uuid"} + extraLabelValues := []string{device, dev.uuid} + + metrics = append(metrics, + btrfsMetric{ + name: "device_size_bytes", + desc: "Size of a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, + value: float64(dev.totalBytes), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }, + // A bytes available metric is probably more useful than a + // bytes used metric, because large numbers of bytes will + // suffer from floating point representation issues + // and we probably care more about the number when it's low anyway + btrfsMetric{ + name: "device_unused_bytes", + desc: "Unused bytes unused on a device that is part of the filesystem.", + metricType: prometheus.GaugeValue, + value: float64(dev.totalBytes - dev.bytesUsed), + extraLabel: extraLabels, + extraLabelValue: extraLabelValues, + }) + + errorLabels := append([]string{"type"}, extraLabels...) + values := []uint64{ + dev.writeErrs, + dev.readErrs, + dev.flushErrs, + dev.corruptionErrs, + dev.generationErrs, + } + btrfsErrorTypeNames := []string{ + "write", + "read", + "flush", + "corruption", + "generation", + } + + for i, errorType := range btrfsErrorTypeNames { + metrics = append(metrics, + btrfsMetric{ + name: "device_errors_total", + desc: "Errors reported for the device", + metricType: prometheus.CounterValue, + value: float64(values[i]), + extraLabel: errorLabels, + extraLabelValue: append([]string{errorType}, extraLabelValues...), + }) + } + } + return metrics } @@ -148,6 +363,7 @@ func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats) { name: "reserved_bytes", desc: "Amount of space reserved for a data type", + metricType: prometheus.GaugeValue, value: float64(s.ReservedBytes), extraLabel: []string{"block_group_type"}, extraLabelValue: []string{a}, @@ -168,6 +384,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr { name: "used_bytes", desc: "Amount of used space by a layout/data type", + metricType: prometheus.GaugeValue, value: float64(s.UsedBytes), extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{a, l}, @@ -175,6 +392,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr { name: "size_bytes", desc: "Amount of space allocated for a layout/data type", + metricType: prometheus.GaugeValue, value: float64(s.TotalBytes), extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{a, l}, @@ -182,6 +400,7 @@ func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btr { name: "allocation_ratio", desc: "Data allocation ratio for a layout/data type", + metricType: prometheus.GaugeValue, value: s.Ratio, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{a, l}, diff --git a/collector/btrfs_linux_test.go b/collector/btrfs_linux_test.go index 7ce19aaf..fcd6f9f9 100644 --- a/collector/btrfs_linux_test.go +++ b/collector/btrfs_linux_test.go @@ -27,8 +27,6 @@ var expectedBtrfsMetrics = [][]btrfsMetric{ { {name: "info", value: 1, extraLabel: []string{"label"}, extraLabelValue: []string{"fixture"}}, {name: "global_rsv_size_bytes", value: 1.6777216e+07}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop26"}}, {name: "reserved_bytes", value: 0, extraLabel: []string{"block_group_type"}, extraLabelValue: []string{"data"}}, {name: "used_bytes", value: 8.08189952e+08, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid0"}}, {name: "size_bytes", value: 2.147483648e+09, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid0"}}, @@ -41,14 +39,12 @@ var expectedBtrfsMetrics = [][]btrfsMetric{ {name: "used_bytes", value: 16384, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}}, {name: "size_bytes", value: 8.388608e+06, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}}, {name: "allocation_ratio", value: 2, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid1"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop26"}}, }, { {name: "info", value: 1, extraLabel: []string{"label"}, extraLabelValue: []string{""}}, {name: "global_rsv_size_bytes", value: 1.6777216e+07}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop22"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop23"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop24"}}, - {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, {name: "reserved_bytes", value: 0, extraLabel: []string{"block_group_type"}, extraLabelValue: []string{"data"}}, {name: "used_bytes", value: 0, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid5"}}, {name: "size_bytes", value: 6.44087808e+08, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"data", "raid5"}}, @@ -61,6 +57,10 @@ var expectedBtrfsMetrics = [][]btrfsMetric{ {name: "used_bytes", value: 16384, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}}, {name: "size_bytes", value: 1.6777216e+07, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}}, {name: "allocation_ratio", value: 2, extraLabel: []string{"block_group_type", "mode"}, extraLabelValue: []string{"system", "raid6"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop22"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop23"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop24"}}, + {name: "device_size_bytes", value: 1.073741824e+10, extraLabel: []string{"device"}, extraLabelValue: []string{"loop25"}}, }, } @@ -104,7 +104,7 @@ func TestBtrfs(t *testing.T) { } for i, s := range stats { - metrics := collector.getMetrics(s) + metrics := collector.getMetrics(s, nil) if len(metrics) != len(expectedBtrfsMetrics[i]) { t.Fatalf("Unexpected number of Btrfs metrics: expected %v, got %v", len(expectedBtrfsMetrics[i]), len(metrics)) } diff --git a/go.mod b/go.mod index b0a035f5..b4e7a671 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.17 require ( github.com/beevik/ntp v0.3.0 github.com/coreos/go-systemd/v22 v22.3.2 + github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00 github.com/go-kit/log v0.2.1 github.com/godbus/dbus/v5 v5.1.0 diff --git a/go.sum b/go.sum index 668b4940..4c22c4cb 100644 --- a/go.sum +++ b/go.sum @@ -63,6 +63,10 @@ github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ3 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac h1:cjfIEKdg/lZ+ewJk8FYoYZTN2HY/okVSPtmHLaYMAvE= +github.com/dennwc/btrfs v0.0.0-20220403080356-b3db0b2dedac/go.mod h1:MYsOV9Dgsec3FFSOjywi0QK5r6TeBbdWxdrMGtiYXHA= +github.com/dennwc/ioctl v1.0.0 h1:DsWAAjIxRqNcLn9x6mwfuf2pet3iB7aK90K4tF16rLg= +github.com/dennwc/ioctl v1.0.0/go.mod h1:ellh2YB5ldny99SBU/VX7Nq0xiZbHphf1DrtHxxjMk0= github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00 h1:0GHzegkDz/zSrt+Zph1OueNImPdUxoToypnkhhRYTjI= github.com/ema/qdisc v0.0.0-20200603082823-62d0308e3e00/go.mod h1:ix4kG2zvdUd8kEKSW0ZTr1XLks0epFpI4j745DXxlNE= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=