Node_Exporter/collector/btrfs_linux.go
Marcus Cobden 135710d8b5
Add btrfs device error stats (#2193)
* Improve metrics filesystem scanning logic
* Makes ioctl syscalls to load the device error stats.
* Adds filesystem mountpoint labels to existing metrics for ease of use.

Signed-off-by: Marcus Cobden <leth@users.noreply.github.com>
2022-09-24 08:25:15 +02:00

410 lines
12 KiB
Go

// Copyright 2019 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build !nobtrfs
// +build !nobtrfs
package collector
import (
"fmt"
"path"
"strings"
"syscall"
dennwc "github.com/dennwc/btrfs"
"github.com/go-kit/log"
"github.com/go-kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs/btrfs"
)
// A btrfsCollector is a Collector which gathers metrics from Btrfs filesystems.
type btrfsCollector struct {
fs btrfs.FS
logger log.Logger
}
func init() {
registerCollector("btrfs", defaultEnabled, NewBtrfsCollector)
}
// NewBtrfsCollector returns a new Collector exposing Btrfs statistics.
func NewBtrfsCollector(logger log.Logger) (Collector, error) {
fs, err := btrfs.NewFS(*sysPath)
if err != nil {
return nil, fmt.Errorf("failed to open sysfs: %w", err)
}
return &btrfsCollector{
fs: fs,
logger: logger,
}, nil
}
// Update retrieves and exports Btrfs statistics.
// It implements Collector.
func (c *btrfsCollector) Update(ch chan<- prometheus.Metric) error {
stats, err := c.fs.Stats()
if err != nil {
return fmt.Errorf("failed to retrieve Btrfs stats from procfs: %w", err)
}
ioctlStatsMap, err := c.getIoctlStats()
if err != nil {
level.Debug(c.logger).Log(
"msg", "Error querying btrfs device stats with ioctl",
"err", err)
ioctlStatsMap = make(map[string]*btrfsIoctlFsStats)
}
for _, s := range stats {
// match up procfs and ioctl info by filesystem UUID (without dashes)
var fsUUID = strings.Replace(s.UUID, "-", "", -1)
ioctlStats := ioctlStatsMap[fsUUID]
c.updateBtrfsStats(ch, s, ioctlStats)
}
return nil
}
type btrfsIoctlFsDevStats struct {
path string
uuid string
bytesUsed uint64
totalBytes uint64
// The error stats below match the following upstream lists:
// https://github.com/dennwc/btrfs/blob/b3db0b2dedac3bf580f412034d77e0bf4b420167/btrfs.go#L132-L140
// https://github.com/torvalds/linux/blob/70d605cbeecb408dd884b1f0cd3963eeeaac144c/include/uapi/linux/btrfs.h#L680-L692
writeErrs uint64
readErrs uint64
flushErrs uint64
corruptionErrs uint64
generationErrs uint64
}
type btrfsIoctlFsStats struct {
uuid string
devices []btrfsIoctlFsDevStats
}
func (c *btrfsCollector) getIoctlStats() (map[string]*btrfsIoctlFsStats, error) {
// Instead of introducing more ioctl calls to scan for all btrfs
// filesytems re-use our mount point utils to find known mounts
mountsList, err := mountPointDetails(c.logger)
if err != nil {
return nil, err
}
// track devices we have successfully scanned, by device path
devicesDone := make(map[string]struct{})
// filesystems scann results by UUID
fsStats := make(map[string]*btrfsIoctlFsStats)
for _, mount := range mountsList {
if mount.fsType != "btrfs" {
continue
}
if _, found := devicesDone[mount.device]; found {
// We already found this filesystem by another mount point
continue
}
fs, err := dennwc.Open(mount.mountPoint, true)
if err != nil {
// failed to open this mount point, maybe we didn't have permission
// maybe we'll find another mount point for this FS later
level.Debug(c.logger).Log(
"msg", "Error inspecting btrfs mountpoint",
"mountPoint", mount.mountPoint,
"err", err)
continue
}
fsInfo, err := fs.Info()
if err != nil {
// Failed to get the FS info for some reason,
// perhaps it'll work with a different mount point
level.Debug(c.logger).Log(
"msg", "Error querying btrfs filesystem",
"mountPoint", mount.mountPoint,
"err", err)
continue
}
fsID := fsInfo.FSID.String()
if _, found := fsStats[fsID]; found {
// We already found this filesystem by another mount point
continue
}
deviceStats, err := c.getIoctlDeviceStats(fs, &fsInfo)
if err != nil {
level.Debug(c.logger).Log(
"msg", "Error querying btrfs device stats",
"mountPoint", mount.mountPoint,
"err", err)
continue
}
devicesDone[mount.device] = struct{}{}
fsStats[fsID] = &btrfsIoctlFsStats{
uuid: fsID,
devices: deviceStats,
}
}
return fsStats, nil
}
func (c *btrfsCollector) getIoctlDeviceStats(fs *dennwc.FS, fsInfo *dennwc.Info) ([]btrfsIoctlFsDevStats, error) {
devices := make([]btrfsIoctlFsDevStats, 0, fsInfo.NumDevices)
for i := uint64(0); i <= fsInfo.MaxID; i++ {
deviceInfo, err := fs.GetDevInfo(i)
if err != nil {
if errno, ok := err.(syscall.Errno); ok && errno == syscall.ENODEV {
// device IDs do not consistently start at 0, nor are ranges contiguous, so we expect this
continue
}
return nil, err
}
deviceStats, err := fs.GetDevStats(i)
if err != nil {
return nil, err
}
devices = append(devices, btrfsIoctlFsDevStats{
path: deviceInfo.Path,
uuid: deviceInfo.UUID.String(),
bytesUsed: deviceInfo.BytesUsed,
totalBytes: deviceInfo.TotalBytes,
writeErrs: deviceStats.WriteErrs,
readErrs: deviceStats.ReadErrs,
flushErrs: deviceStats.FlushErrs,
corruptionErrs: deviceStats.CorruptionErrs,
generationErrs: deviceStats.GenerationErrs,
})
if uint64(len(devices)) == fsInfo.NumDevices {
break
}
}
return devices, nil
}
// btrfsMetric represents a single Btrfs metric that is converted into a Prometheus Metric.
type btrfsMetric struct {
name string
metricType prometheus.ValueType
desc string
value float64
extraLabel []string
extraLabelValue []string
}
// updateBtrfsStats collects statistics for one bcache ID.
func (c *btrfsCollector) updateBtrfsStats(ch chan<- prometheus.Metric, s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) {
const subsystem = "btrfs"
// Basic information about the filesystem.
devLabels := []string{"uuid"}
// Retrieve the metrics.
metrics := c.getMetrics(s, ioctlStats)
// Convert all gathered metrics to Prometheus Metrics and add to channel.
for _, m := range metrics {
labels := append(devLabels, m.extraLabel...)
desc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, m.name),
m.desc,
labels,
nil,
)
labelValues := []string{s.UUID}
if len(m.extraLabelValue) > 0 {
labelValues = append(labelValues, m.extraLabelValue...)
}
ch <- prometheus.MustNewConstMetric(
desc,
m.metricType,
m.value,
labelValues...,
)
}
}
// getMetrics returns metrics for the given Btrfs statistics.
func (c *btrfsCollector) getMetrics(s *btrfs.Stats, ioctlStats *btrfsIoctlFsStats) []btrfsMetric {
metrics := []btrfsMetric{
{
name: "info",
desc: "Filesystem information",
value: 1,
metricType: prometheus.GaugeValue,
extraLabel: []string{"label"},
extraLabelValue: []string{s.Label},
},
{
name: "global_rsv_size_bytes",
desc: "Size of global reserve.",
metricType: prometheus.GaugeValue,
value: float64(s.Allocation.GlobalRsvSize),
},
}
// Information about data, metadata and system data.
metrics = append(metrics, c.getAllocationStats("data", s.Allocation.Data)...)
metrics = append(metrics, c.getAllocationStats("metadata", s.Allocation.Metadata)...)
metrics = append(metrics, c.getAllocationStats("system", s.Allocation.System)...)
// Information about devices.
if ioctlStats == nil {
for n, dev := range s.Devices {
metrics = append(metrics, btrfsMetric{
name: "device_size_bytes",
desc: "Size of a device that is part of the filesystem.",
metricType: prometheus.GaugeValue,
value: float64(dev.Size),
extraLabel: []string{"device"},
extraLabelValue: []string{n},
})
}
return metrics
}
for _, dev := range ioctlStats.devices {
// trim the path prefix from the device name so the value should match
// the value used in the fallback branch above.
// e.g. /dev/sda -> sda, /rootfs/dev/md1 -> md1
_, device := path.Split(dev.path)
extraLabels := []string{"device", "btrfs_dev_uuid"}
extraLabelValues := []string{device, dev.uuid}
metrics = append(metrics,
btrfsMetric{
name: "device_size_bytes",
desc: "Size of a device that is part of the filesystem.",
metricType: prometheus.GaugeValue,
value: float64(dev.totalBytes),
extraLabel: extraLabels,
extraLabelValue: extraLabelValues,
},
// A bytes available metric is probably more useful than a
// bytes used metric, because large numbers of bytes will
// suffer from floating point representation issues
// and we probably care more about the number when it's low anyway
btrfsMetric{
name: "device_unused_bytes",
desc: "Unused bytes unused on a device that is part of the filesystem.",
metricType: prometheus.GaugeValue,
value: float64(dev.totalBytes - dev.bytesUsed),
extraLabel: extraLabels,
extraLabelValue: extraLabelValues,
})
errorLabels := append([]string{"type"}, extraLabels...)
values := []uint64{
dev.writeErrs,
dev.readErrs,
dev.flushErrs,
dev.corruptionErrs,
dev.generationErrs,
}
btrfsErrorTypeNames := []string{
"write",
"read",
"flush",
"corruption",
"generation",
}
for i, errorType := range btrfsErrorTypeNames {
metrics = append(metrics,
btrfsMetric{
name: "device_errors_total",
desc: "Errors reported for the device",
metricType: prometheus.CounterValue,
value: float64(values[i]),
extraLabel: errorLabels,
extraLabelValue: append([]string{errorType}, extraLabelValues...),
})
}
}
return metrics
}
// getAllocationStats returns allocation metrics for the given Btrfs Allocation statistics.
func (c *btrfsCollector) getAllocationStats(a string, s *btrfs.AllocationStats) []btrfsMetric {
metrics := []btrfsMetric{
{
name: "reserved_bytes",
desc: "Amount of space reserved for a data type",
metricType: prometheus.GaugeValue,
value: float64(s.ReservedBytes),
extraLabel: []string{"block_group_type"},
extraLabelValue: []string{a},
},
}
// Add all layout statistics.
for layout, stats := range s.Layouts {
metrics = append(metrics, c.getLayoutStats(a, layout, stats)...)
}
return metrics
}
// getLayoutStats returns metrics for a data layout.
func (c *btrfsCollector) getLayoutStats(a, l string, s *btrfs.LayoutUsage) []btrfsMetric {
return []btrfsMetric{
{
name: "used_bytes",
desc: "Amount of used space by a layout/data type",
metricType: prometheus.GaugeValue,
value: float64(s.UsedBytes),
extraLabel: []string{"block_group_type", "mode"},
extraLabelValue: []string{a, l},
},
{
name: "size_bytes",
desc: "Amount of space allocated for a layout/data type",
metricType: prometheus.GaugeValue,
value: float64(s.TotalBytes),
extraLabel: []string{"block_group_type", "mode"},
extraLabelValue: []string{a, l},
},
{
name: "allocation_ratio",
desc: "Data allocation ratio for a layout/data type",
metricType: prometheus.GaugeValue,
value: s.Ratio,
extraLabel: []string{"block_group_type", "mode"},
extraLabelValue: []string{a, l},
},
}
}