Node_Exporter/collector/systemd_linux.go
Paul Gier 8c3de12c22 systemd: check version for availability of properties (#1413)
The dbus property 'SystemState' and the timer property 'LastTriggerUSec'
were added in version 212 of systemd.
Check that the version of systemd is higher than 212 before attempting
to query these properties

f755e3b74b
dedabea4b3

Resolves issue #291

Signed-off-by: Paul Gier <pgier@redhat.com>
2019-09-04 16:27:25 +02:00

477 lines
16 KiB
Go

// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !nosystemd
package collector
import (
"fmt"
"math"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/coreos/go-systemd/dbus"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
kingpin "gopkg.in/alecthomas/kingpin.v2"
)
const (
// minSystemdVersionSystemState is the minimum SystemD version for availability of
// the 'SystemState' manager property and the timer property 'LastTriggerUSec'
// https://github.com/prometheus/node_exporter/issues/291
minSystemdVersionSystemState = 212
)
var (
unitWhitelist = kingpin.Flag("collector.systemd.unit-whitelist", "Regexp of systemd units to whitelist. Units must both match whitelist and not match blacklist to be included.").Default(".+").String()
unitBlacklist = kingpin.Flag("collector.systemd.unit-blacklist", "Regexp of systemd units to blacklist. Units must both match whitelist and not match blacklist to be included.").Default(".+\\.(automount|device|mount|scope|slice)").String()
systemdPrivate = kingpin.Flag("collector.systemd.private", "Establish a private, direct connection to systemd without dbus.").Bool()
enableTaskMetrics = kingpin.Flag("collector.systemd.enable-task-metrics", "Enables service unit tasks metrics unit_tasks_current and unit_tasks_max").Bool()
enableRestartsMetrics = kingpin.Flag("collector.systemd.enable-restarts-metrics", "Enables service unit metric service_restart_total").Bool()
enableStartTimeMetrics = kingpin.Flag("collector.systemd.enable-start-time-metrics", "Enables service unit metric unit_start_time_seconds").Bool()
)
type systemdCollector struct {
unitDesc *prometheus.Desc
unitStartTimeDesc *prometheus.Desc
unitTasksCurrentDesc *prometheus.Desc
unitTasksMaxDesc *prometheus.Desc
systemRunningDesc *prometheus.Desc
summaryDesc *prometheus.Desc
nRestartsDesc *prometheus.Desc
timerLastTriggerDesc *prometheus.Desc
socketAcceptedConnectionsDesc *prometheus.Desc
socketCurrentConnectionsDesc *prometheus.Desc
socketRefusedConnectionsDesc *prometheus.Desc
systemdVersionDesc *prometheus.Desc
systemdVersion int
unitWhitelistPattern *regexp.Regexp
unitBlacklistPattern *regexp.Regexp
}
var unitStatesName = []string{"active", "activating", "deactivating", "inactive", "failed"}
func init() {
registerCollector("systemd", defaultDisabled, NewSystemdCollector)
}
// NewSystemdCollector returns a new Collector exposing systemd statistics.
func NewSystemdCollector() (Collector, error) {
const subsystem = "systemd"
unitDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_state"),
"Systemd unit", []string{"name", "state", "type"}, nil,
)
unitStartTimeDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_start_time_seconds"),
"Start time of the unit since unix epoch in seconds.", []string{"name"}, nil,
)
unitTasksCurrentDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_tasks_current"),
"Current number of tasks per Systemd unit", []string{"name"}, nil,
)
unitTasksMaxDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "unit_tasks_max"),
"Maximum number of tasks per Systemd unit", []string{"name"}, nil,
)
systemRunningDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "system_running"),
"Whether the system is operational (see 'systemctl is-system-running')",
nil, nil,
)
summaryDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "units"),
"Summary of systemd unit states", []string{"state"}, nil)
nRestartsDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "service_restart_total"),
"Service unit count of Restart triggers", []string{"name"}, nil)
timerLastTriggerDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "timer_last_trigger_seconds"),
"Seconds since epoch of last trigger.", []string{"name"}, nil)
socketAcceptedConnectionsDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "socket_accepted_connections_total"),
"Total number of accepted socket connections", []string{"name"}, nil)
socketCurrentConnectionsDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "socket_current_connections"),
"Current number of socket connections", []string{"name"}, nil)
socketRefusedConnectionsDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "socket_refused_connections_total"),
"Total number of refused socket connections", []string{"name"}, nil)
systemdVersionDesc := prometheus.NewDesc(
prometheus.BuildFQName(namespace, subsystem, "version"),
"Detected systemd version", []string{}, nil)
unitWhitelistPattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitWhitelist))
unitBlacklistPattern := regexp.MustCompile(fmt.Sprintf("^(?:%s)$", *unitBlacklist))
systemdVersion := getSystemdVersion()
if systemdVersion < minSystemdVersionSystemState {
log.Warnf("Detected systemd version %v is lower than minimum %v", systemdVersion, minSystemdVersionSystemState)
log.Warn("Some systemd state and timer metrics will not be available")
}
return &systemdCollector{
unitDesc: unitDesc,
unitStartTimeDesc: unitStartTimeDesc,
unitTasksCurrentDesc: unitTasksCurrentDesc,
unitTasksMaxDesc: unitTasksMaxDesc,
systemRunningDesc: systemRunningDesc,
summaryDesc: summaryDesc,
nRestartsDesc: nRestartsDesc,
timerLastTriggerDesc: timerLastTriggerDesc,
socketAcceptedConnectionsDesc: socketAcceptedConnectionsDesc,
socketCurrentConnectionsDesc: socketCurrentConnectionsDesc,
socketRefusedConnectionsDesc: socketRefusedConnectionsDesc,
systemdVersionDesc: systemdVersionDesc,
systemdVersion: systemdVersion,
unitWhitelistPattern: unitWhitelistPattern,
unitBlacklistPattern: unitBlacklistPattern,
}, nil
}
// Update gathers metrics from systemd. Dbus collection is done in parallel
// to reduce wait time for responses.
func (c *systemdCollector) Update(ch chan<- prometheus.Metric) error {
begin := time.Now()
conn, err := newSystemdDbusConn()
if err != nil {
return fmt.Errorf("couldn't get dbus connection: %s", err)
}
defer conn.Close()
allUnits, err := c.getAllUnits(conn)
if err != nil {
return fmt.Errorf("couldn't get units: %s", err)
}
log.Debugf("systemd getAllUnits took %f", time.Since(begin).Seconds())
begin = time.Now()
summary := summarizeUnits(allUnits)
c.collectSummaryMetrics(ch, summary)
log.Debugf("systemd collectSummaryMetrics took %f", time.Since(begin).Seconds())
begin = time.Now()
units := filterUnits(allUnits, c.unitWhitelistPattern, c.unitBlacklistPattern)
log.Debugf("systemd filterUnits took %f", time.Since(begin).Seconds())
var wg sync.WaitGroup
defer wg.Wait()
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectUnitStatusMetrics(conn, ch, units)
log.Debugf("systemd collectUnitStatusMetrics took %f", time.Since(begin).Seconds())
}()
if *enableStartTimeMetrics {
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectUnitStartTimeMetrics(conn, ch, units)
log.Debugf("systemd collectUnitStartTimeMetrics took %f", time.Since(begin).Seconds())
}()
}
if *enableTaskMetrics {
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectUnitTasksMetrics(conn, ch, units)
log.Debugf("systemd collectUnitTasksMetrics took %f", time.Since(begin).Seconds())
}()
}
if c.systemdVersion >= minSystemdVersionSystemState {
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectTimers(conn, ch, units)
log.Debugf("systemd collectTimers took %f", time.Since(begin).Seconds())
}()
}
wg.Add(1)
go func() {
defer wg.Done()
begin = time.Now()
c.collectSockets(conn, ch, units)
log.Debugf("systemd collectSockets took %f", time.Since(begin).Seconds())
}()
if c.systemdVersion >= minSystemdVersionSystemState {
begin = time.Now()
err = c.collectSystemState(conn, ch)
log.Debugf("systemd collectSystemState took %f", time.Since(begin).Seconds())
}
ch <- prometheus.MustNewConstMetric(
c.systemdVersionDesc, prometheus.GaugeValue, float64(c.systemdVersion))
return err
}
func (c *systemdCollector) collectUnitStatusMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
for _, unit := range units {
serviceType := ""
if strings.HasSuffix(unit.Name, ".service") {
serviceTypeProperty, err := conn.GetUnitTypeProperty(unit.Name, "Service", "Type")
if err != nil {
log.Debugf("couldn't get unit '%s' Type: %s", unit.Name, err)
} else {
serviceType = serviceTypeProperty.Value.Value().(string)
}
} else if strings.HasSuffix(unit.Name, ".mount") {
serviceTypeProperty, err := conn.GetUnitTypeProperty(unit.Name, "Mount", "Type")
if err != nil {
log.Debugf("couldn't get unit '%s' Type: %s", unit.Name, err)
} else {
serviceType = serviceTypeProperty.Value.Value().(string)
}
}
for _, stateName := range unitStatesName {
isActive := 0.0
if stateName == unit.ActiveState {
isActive = 1.0
}
ch <- prometheus.MustNewConstMetric(
c.unitDesc, prometheus.GaugeValue, isActive,
unit.Name, stateName, serviceType)
}
if *enableRestartsMetrics && strings.HasSuffix(unit.Name, ".service") {
// NRestarts wasn't added until systemd 235.
restartsCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "NRestarts")
if err != nil {
log.Debugf("couldn't get unit '%s' NRestarts: %s", unit.Name, err)
} else {
ch <- prometheus.MustNewConstMetric(
c.nRestartsDesc, prometheus.CounterValue,
float64(restartsCount.Value.Value().(uint32)), unit.Name)
}
}
}
}
func (c *systemdCollector) collectSockets(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
for _, unit := range units {
if !strings.HasSuffix(unit.Name, ".socket") {
continue
}
acceptedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NAccepted")
if err != nil {
log.Debugf("couldn't get unit '%s' NAccepted: %s", unit.Name, err)
continue
}
ch <- prometheus.MustNewConstMetric(
c.socketAcceptedConnectionsDesc, prometheus.CounterValue,
float64(acceptedConnectionCount.Value.Value().(uint32)), unit.Name)
currentConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NConnections")
if err != nil {
log.Debugf("couldn't get unit '%s' NConnections: %s", unit.Name, err)
continue
}
ch <- prometheus.MustNewConstMetric(
c.socketCurrentConnectionsDesc, prometheus.GaugeValue,
float64(currentConnectionCount.Value.Value().(uint32)), unit.Name)
// NRefused wasn't added until systemd 239.
refusedConnectionCount, err := conn.GetUnitTypeProperty(unit.Name, "Socket", "NRefused")
if err != nil {
//log.Debugf("couldn't get unit '%s' NRefused: %s", unit.Name, err)
} else {
ch <- prometheus.MustNewConstMetric(
c.socketRefusedConnectionsDesc, prometheus.GaugeValue,
float64(refusedConnectionCount.Value.Value().(uint32)), unit.Name)
}
}
}
func (c *systemdCollector) collectUnitStartTimeMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
var startTimeUsec uint64
for _, unit := range units {
if unit.ActiveState != "active" {
startTimeUsec = 0
} else {
timestampValue, err := conn.GetUnitProperty(unit.Name, "ActiveEnterTimestamp")
if err != nil {
log.Debugf("couldn't get unit '%s' StartTimeUsec: %s", unit.Name, err)
continue
}
startTimeUsec = timestampValue.Value.Value().(uint64)
}
ch <- prometheus.MustNewConstMetric(
c.unitStartTimeDesc, prometheus.GaugeValue,
float64(startTimeUsec)/1e6, unit.Name)
}
}
func (c *systemdCollector) collectUnitTasksMetrics(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
var val uint64
for _, unit := range units {
if strings.HasSuffix(unit.Name, ".service") {
tasksCurrentCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksCurrent")
if err != nil {
log.Debugf("couldn't get unit '%s' TasksCurrent: %s", unit.Name, err)
} else {
val = tasksCurrentCount.Value.Value().(uint64)
// Don't set if tasksCurrent if dbus reports MaxUint64.
if val != math.MaxUint64 {
ch <- prometheus.MustNewConstMetric(
c.unitTasksCurrentDesc, prometheus.GaugeValue,
float64(val), unit.Name)
}
}
tasksMaxCount, err := conn.GetUnitTypeProperty(unit.Name, "Service", "TasksMax")
if err != nil {
log.Debugf("couldn't get unit '%s' TasksMax: %s", unit.Name, err)
} else {
val = tasksMaxCount.Value.Value().(uint64)
// Don't set if tasksMax if dbus reports MaxUint64.
if val != math.MaxUint64 {
ch <- prometheus.MustNewConstMetric(
c.unitTasksMaxDesc, prometheus.GaugeValue,
float64(val), unit.Name)
}
}
}
}
}
func (c *systemdCollector) collectTimers(conn *dbus.Conn, ch chan<- prometheus.Metric, units []unit) {
for _, unit := range units {
if !strings.HasSuffix(unit.Name, ".timer") {
continue
}
lastTriggerValue, err := conn.GetUnitTypeProperty(unit.Name, "Timer", "LastTriggerUSec")
if err != nil {
log.Debugf("couldn't get unit '%s' LastTriggerUSec: %s", unit.Name, err)
continue
}
ch <- prometheus.MustNewConstMetric(
c.timerLastTriggerDesc, prometheus.GaugeValue,
float64(lastTriggerValue.Value.Value().(uint64))/1e6, unit.Name)
}
}
func (c *systemdCollector) collectSummaryMetrics(ch chan<- prometheus.Metric, summary map[string]float64) {
for stateName, count := range summary {
ch <- prometheus.MustNewConstMetric(
c.summaryDesc, prometheus.GaugeValue, count, stateName)
}
}
func (c *systemdCollector) collectSystemState(conn *dbus.Conn, ch chan<- prometheus.Metric) error {
systemState, err := conn.GetManagerProperty("SystemState")
if err != nil {
return fmt.Errorf("couldn't get system state: %s", err)
}
isSystemRunning := 0.0
if systemState == `"running"` {
isSystemRunning = 1.0
}
ch <- prometheus.MustNewConstMetric(c.systemRunningDesc, prometheus.GaugeValue, isSystemRunning)
return nil
}
func newSystemdDbusConn() (*dbus.Conn, error) {
if *systemdPrivate {
return dbus.NewSystemdConnection()
}
return dbus.New()
}
type unit struct {
dbus.UnitStatus
}
func (c *systemdCollector) getAllUnits(conn *dbus.Conn) ([]unit, error) {
allUnits, err := conn.ListUnits()
if err != nil {
return nil, err
}
result := make([]unit, 0, len(allUnits))
for _, status := range allUnits {
unit := unit{
UnitStatus: status,
}
result = append(result, unit)
}
return result, nil
}
func summarizeUnits(units []unit) map[string]float64 {
summarized := make(map[string]float64)
for _, unitStateName := range unitStatesName {
summarized[unitStateName] = 0.0
}
for _, unit := range units {
summarized[unit.ActiveState] += 1.0
}
return summarized
}
func filterUnits(units []unit, whitelistPattern, blacklistPattern *regexp.Regexp) []unit {
filtered := make([]unit, 0, len(units))
for _, unit := range units {
if whitelistPattern.MatchString(unit.Name) && !blacklistPattern.MatchString(unit.Name) && unit.LoadState == "loaded" {
log.Debugf("Adding unit: %s", unit.Name)
filtered = append(filtered, unit)
} else {
log.Debugf("Ignoring unit: %s", unit.Name)
}
}
return filtered
}
func getSystemdVersion() int {
conn, err := newSystemdDbusConn()
if err != nil {
log.Warnf("Unable to get systemd dbus connection, defaulting systemd version to 0: %s", err)
return 0
}
defer conn.Close()
version, err := conn.GetManagerProperty("Version")
if err != nil {
log.Warn("Unable to get systemd version property, defaulting to 0")
return 0
}
version = strings.Replace(version, "\"", "", 2)
v, err := strconv.Atoi(version)
if err != nil {
log.Warnf("Got invalid systemd version: %v", version)
return 0
}
return v
}