2016-10-06 17:33:24 +02:00
// Copyright 2016 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2021-10-03 13:35:24 +02:00
//go:build !nohwmon
2016-10-06 17:33:24 +02:00
// +build !nohwmon
package collector
import (
"errors"
2024-09-26 00:00:04 +02:00
"fmt"
2024-09-11 10:51:28 +02:00
"log/slog"
2016-10-06 17:33:24 +02:00
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
2023-07-07 10:30:24 +02:00
"github.com/alecthomas/kingpin/v2"
2016-10-06 17:33:24 +02:00
"github.com/prometheus/client_golang/prometheus"
2019-05-10 20:04:06 +02:00
"golang.org/x/sys/unix"
2016-10-06 17:33:24 +02:00
)
var (
2024-07-14 13:50:06 +02:00
collectorHWmonChipInclude = kingpin . Flag ( "collector.hwmon.chip-include" , "Regexp of hwmon chip to include (mutually exclusive to device-exclude)." ) . String ( )
collectorHWmonChipExclude = kingpin . Flag ( "collector.hwmon.chip-exclude" , "Regexp of hwmon chip to exclude (mutually exclusive to device-include)." ) . String ( )
collectorHWmonSensorInclude = kingpin . Flag ( "collector.hwmon.sensor-include" , "Regexp of hwmon sensor to include (mutually exclusive to sensor-exclude)." ) . String ( )
collectorHWmonSensorExclude = kingpin . Flag ( "collector.hwmon.sensor-exclude" , "Regexp of hwmon sensor to exclude (mutually exclusive to sensor-include)." ) . String ( )
2023-07-07 10:30:24 +02:00
2016-10-06 17:33:24 +02:00
hwmonInvalidMetricChars = regexp . MustCompile ( "[^a-z0-9:_]" )
hwmonFilenameFormat = regexp . MustCompile ( ` ^(?P<type>[^0-9]+)(?P<id>[0-9]*)?(_(?P<property>.+))?$ ` )
hwmonLabelDesc = [ ] string { "chip" , "sensor" }
2016-11-29 11:53:29 +01:00
hwmonChipNameLabelDesc = [ ] string { "chip" , "chip_name" }
2016-10-06 17:33:24 +02:00
hwmonSensorTypes = [ ] string {
"vrm" , "beep_enable" , "update_interval" , "in" , "cpu" , "fan" ,
"pwm" , "temp" , "curr" , "power" , "energy" , "humidity" ,
2024-08-22 22:42:48 +02:00
"intrusion" , "freq" ,
2016-10-06 17:33:24 +02:00
}
)
func init ( ) {
2017-09-28 15:06:26 +02:00
registerCollector ( "hwmon" , defaultEnabled , NewHwMonCollector )
2016-10-06 17:33:24 +02:00
}
2019-12-31 17:19:37 +01:00
type hwMonCollector struct {
2023-07-07 10:30:24 +02:00
deviceFilter deviceFilter
2024-07-14 13:50:06 +02:00
sensorFilter deviceFilter
2024-09-11 10:51:28 +02:00
logger * slog . Logger
2019-12-31 17:19:37 +01:00
}
2016-10-06 17:33:24 +02:00
2017-02-28 17:44:53 +01:00
// NewHwMonCollector returns a new Collector exposing /sys/class/hwmon stats
// (similar to lm-sensors).
2024-09-11 10:51:28 +02:00
func NewHwMonCollector ( logger * slog . Logger ) ( Collector , error ) {
2023-07-07 10:30:24 +02:00
return & hwMonCollector {
logger : logger ,
2023-07-10 12:46:30 +02:00
deviceFilter : newDeviceFilter ( * collectorHWmonChipExclude , * collectorHWmonChipInclude ) ,
2024-07-14 13:50:06 +02:00
sensorFilter : newDeviceFilter ( * collectorHWmonSensorExclude , * collectorHWmonSensorInclude ) ,
2023-07-07 10:30:24 +02:00
} , nil
2016-10-06 17:33:24 +02:00
}
// cleanMetricName lower-cases name, substitutes "_" for every character
// matched by hwmonInvalidMetricChars and strips leading and trailing
// underscores from the result.
func cleanMetricName(name string) string {
	sanitized := hwmonInvalidMetricChars.ReplaceAllLiteralString(strings.ToLower(name), "_")
	return strings.Trim(sanitized, "_")
}
func addValueFile ( data map [ string ] map [ string ] string , sensor string , prop string , file string ) {
2017-11-07 07:49:37 +01:00
raw , err := sysReadFile ( file )
if err != nil {
2016-10-06 17:33:24 +02:00
return
}
value := strings . Trim ( string ( raw ) , "\n" )
if _ , ok := data [ sensor ] ; ! ok {
data [ sensor ] = make ( map [ string ] string )
}
data [ sensor ] [ prop ] = value
}
2022-07-27 20:59:39 +02:00
// sysReadFile is a simplified os.ReadFile that invokes syscall.Read directly.
2017-11-07 07:49:37 +01:00
func sysReadFile ( file string ) ( [ ] byte , error ) {
f , err := os . Open ( file )
if err != nil {
return nil , err
}
defer f . Close ( )
// On some machines, hwmon drivers are broken and return EAGAIN. This causes
2022-07-27 20:59:39 +02:00
// Go's os.ReadFile implementation to poll forever.
2017-11-07 07:49:37 +01:00
//
// Since we either want to read data or bail immediately, do the simplest
2019-05-10 20:04:06 +02:00
// possible read using system call directly.
2017-11-07 07:49:37 +01:00
b := make ( [ ] byte , 128 )
2019-05-10 20:04:06 +02:00
n , err := unix . Read ( int ( f . Fd ( ) ) , b )
2017-11-07 07:49:37 +01:00
if err != nil {
return nil , err
}
2024-09-26 00:00:04 +02:00
if n < 0 {
return nil , fmt . Errorf ( "failed to read file: %q, read returned negative bytes value: %d" , file , n )
}
2017-11-07 07:49:37 +01:00
return b [ : n ] , nil
}
2017-02-28 17:44:53 +01:00
// explodeSensorFilename splits a sensor name into <type><num>_<property>.
2016-10-06 17:33:24 +02:00
func explodeSensorFilename ( filename string ) ( ok bool , sensorType string , sensorNum int , sensorProperty string ) {
matches := hwmonFilenameFormat . FindStringSubmatch ( filename )
if len ( matches ) == 0 {
return false , sensorType , sensorNum , sensorProperty
}
for i , match := range hwmonFilenameFormat . SubexpNames ( ) {
if i >= len ( matches ) {
return true , sensorType , sensorNum , sensorProperty
}
if match == "type" {
sensorType = matches [ i ]
}
if match == "property" {
sensorProperty = matches [ i ]
}
if match == "id" && len ( matches [ i ] ) > 0 {
if num , err := strconv . Atoi ( matches [ i ] ) ; err == nil {
sensorNum = num
} else {
return false , sensorType , sensorNum , sensorProperty
}
}
}
return true , sensorType , sensorNum , sensorProperty
}
2017-02-28 22:33:46 +01:00
func collectSensorData ( dir string , data map [ string ] map [ string ] string ) error {
2022-07-27 20:59:39 +02:00
sensorFiles , dirError := os . ReadDir ( dir )
2016-10-06 17:33:24 +02:00
if dirError != nil {
return dirError
}
for _ , file := range sensorFiles {
filename := file . Name ( )
ok , sensorType , sensorNum , sensorProperty := explodeSensorFilename ( filename )
if ! ok {
continue
}
for _ , t := range hwmonSensorTypes {
if t == sensorType {
2019-02-05 16:37:27 +01:00
addValueFile ( data , sensorType + strconv . Itoa ( sensorNum ) , sensorProperty , filepath . Join ( dir , file . Name ( ) ) )
2016-10-06 17:33:24 +02:00
break
}
}
}
return nil
}
2017-02-28 22:33:46 +01:00
func ( c * hwMonCollector ) updateHwmon ( ch chan <- prometheus . Metric , dir string ) error {
2016-10-06 17:33:24 +02:00
hwmonName , err := c . hwmonName ( dir )
if err != nil {
return err
}
2023-07-07 10:30:24 +02:00
if c . deviceFilter . ignored ( hwmonName ) {
2024-09-11 10:51:28 +02:00
c . logger . Debug ( "ignoring hwmon chip" , "chip" , hwmonName )
2023-07-07 10:30:24 +02:00
return nil
}
2016-10-06 17:33:24 +02:00
data := make ( map [ string ] map [ string ] string )
err = collectSensorData ( dir , data )
if err != nil {
return err
}
2019-02-05 16:37:27 +01:00
if _ , err := os . Stat ( filepath . Join ( dir , "device" ) ) ; err == nil {
err := collectSensorData ( filepath . Join ( dir , "device" ) , data )
2016-10-06 17:33:24 +02:00
if err != nil {
return err
}
}
2016-11-29 11:53:29 +01:00
hwmonChipName , err := c . hwmonHumanReadableChipName ( dir )
if err == nil {
// sensor chip metadata
desc := prometheus . NewDesc (
"node_hwmon_chip_names" ,
"Annotation metric for human-readable chip names" ,
hwmonChipNameLabelDesc ,
nil ,
)
ch <- prometheus . MustNewConstMetric (
desc ,
prometheus . GaugeValue ,
1.0 ,
hwmonName ,
hwmonChipName ,
)
}
2017-02-28 17:44:53 +01:00
// Format all sensors.
2016-10-06 17:33:24 +02:00
for sensor , sensorData := range data {
2024-07-14 13:50:06 +02:00
// Filtering for sensors is done on concatenated device name and sensor name
// separated by a semicolon. This allows for excluding or including of specific
// sensors on specific devices. For example, to exclude the sensor "temp3" on
// the device "platform_coretemp_0", use "platform_coretemp_0;temp3"
if c . sensorFilter . ignored ( hwmonName + ";" + sensor ) {
2024-09-11 10:51:28 +02:00
c . logger . Debug ( "ignoring sensor" , "sensor" , sensor )
2024-07-14 13:50:06 +02:00
continue
}
2016-10-06 17:33:24 +02:00
_ , sensorType , _ , _ := explodeSensorFilename ( sensor )
2017-01-09 18:33:31 +01:00
labels := [ ] string { hwmonName , sensor }
2016-10-06 17:33:24 +02:00
if labelText , ok := sensorData [ "label" ] ; ok {
2022-10-11 14:40:28 +02:00
label := strings . ToValidUTF8 ( labelText , "<22> " )
desc := prometheus . NewDesc ( "node_hwmon_sensor_label" , "Label for given chip and sensor" ,
[ ] string { "chip" , "sensor" , "label" } , nil )
ch <- prometheus . MustNewConstMetric ( desc , prometheus . GaugeValue , 1.0 , hwmonName , sensor , label )
2016-10-06 17:33:24 +02:00
}
if sensorType == "beep_enable" {
value := 0.0
if sensorData [ "" ] == "1" {
value = 1.0
}
metricName := "node_hwmon_beep_enabled"
desc := prometheus . NewDesc ( metricName , "Hardware beep enabled" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , value , labels ... )
continue
}
if sensorType == "vrm" {
parsedValue , err := strconv . ParseFloat ( sensorData [ "" ] , 64 )
if err != nil {
continue
}
metricName := "node_hwmon_voltage_regulator_version"
desc := prometheus . NewDesc ( metricName , "Hardware voltage regulator" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue , labels ... )
continue
}
if sensorType == "update_interval" {
parsedValue , err := strconv . ParseFloat ( sensorData [ "" ] , 64 )
if err != nil {
continue
}
metricName := "node_hwmon_update_interval_seconds"
desc := prometheus . NewDesc ( metricName , "Hardware monitor update interval" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue * 0.001 , labels ... )
continue
}
prefix := "node_hwmon_" + sensorType
for element , value := range sensorData {
if element == "label" {
continue
}
name := prefix
if element == "input" {
// input is actually the value
if _ , ok := sensorData [ "" ] ; ok {
name = name + "_input"
}
} else if element != "" {
name = name + "_" + cleanMetricName ( element )
}
parsedValue , err := strconv . ParseFloat ( value , 64 )
if err != nil {
continue
}
// special elements, fault, alarm & beep should be handed out without units
if element == "fault" || element == "alarm" {
desc := prometheus . NewDesc ( name , "Hardware sensor " + element + " status (" + sensorType + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric ( desc , prometheus . GaugeValue , parsedValue , labels ... )
continue
}
if element == "beep" {
desc := prometheus . NewDesc ( name + "_enabled" , "Hardware monitor sensor has beeping enabled" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric ( desc , prometheus . GaugeValue , parsedValue , labels ... )
continue
}
// everything else should get a unit
if sensorType == "in" || sensorType == "cpu" {
desc := prometheus . NewDesc ( name + "_volts" , "Hardware monitor for voltage (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue * 0.001 , labels ... )
continue
}
if sensorType == "temp" && element != "type" {
2018-10-30 18:49:22 +01:00
if element == "" {
element = "input"
}
2016-10-06 17:33:24 +02:00
desc := prometheus . NewDesc ( name + "_celsius" , "Hardware monitor for temperature (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue * 0.001 , labels ... )
continue
}
if sensorType == "curr" {
desc := prometheus . NewDesc ( name + "_amps" , "Hardware monitor for current (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue * 0.001 , labels ... )
continue
}
if sensorType == "energy" {
desc := prometheus . NewDesc ( name + "_joule_total" , "Hardware monitor for joules used so far (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . CounterValue , parsedValue / 1000000.0 , labels ... )
continue
}
if sensorType == "power" && element == "accuracy" {
desc := prometheus . NewDesc ( name , "Hardware monitor power meter accuracy, as a ratio" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue / 1000000.0 , labels ... )
continue
}
if sensorType == "power" && ( element == "average_interval" || element == "average_interval_min" || element == "average_interval_max" ) {
desc := prometheus . NewDesc ( name + "_seconds" , "Hardware monitor power usage update interval (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue * 0.001 , labels ... )
continue
}
if sensorType == "power" {
desc := prometheus . NewDesc ( name + "_watt" , "Hardware monitor for power usage in watts (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue / 1000000.0 , labels ... )
continue
}
if sensorType == "humidity" {
desc := prometheus . NewDesc ( name , "Hardware monitor for humidity, as a ratio (multiply with 100.0 to get the humidity as a percentage) (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue / 1000000.0 , labels ... )
continue
}
if sensorType == "fan" && ( element == "input" || element == "min" || element == "max" || element == "target" ) {
desc := prometheus . NewDesc ( name + "_rpm" , "Hardware monitor for fan revolutions per minute (" + element + ")" , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue , labels ... )
continue
}
2024-08-22 22:42:48 +02:00
if sensorType == "freq" && element == "input" {
if label , ok := sensorData [ "label" ] ; ok {
sensorLabel := cleanMetricName ( label )
2024-08-23 22:34:23 +02:00
desc := prometheus . NewDesc ( name + "_freq_mhz" , "Hardware monitor for GPU frequency in MHz" , hwmonLabelDesc , nil )
2024-08-22 22:42:48 +02:00
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue / 1000000.0 , append ( labels [ : len ( labels ) - 1 ] , sensorLabel ) ... )
}
continue
}
2016-10-06 17:33:24 +02:00
// fallback, just dump the metric as is
desc := prometheus . NewDesc ( name , "Hardware monitor " + sensorType + " element " + element , hwmonLabelDesc , nil )
ch <- prometheus . MustNewConstMetric (
desc , prometheus . GaugeValue , parsedValue , labels ... )
}
}
return nil
}
func ( c * hwMonCollector ) hwmonName ( dir string ) ( string , error ) {
// generate a name for a sensor path
// sensor numbering depends on the order of linux module loading and
// is thus unstable.
// However the path of the device has to be stable:
// - /sys/devices/<bus>/<device>
// Some hardware monitors have a "name" file that exports a human
2021-05-26 03:58:37 +02:00
// readable name that can be used.
2016-10-06 17:33:24 +02:00
// human readable names would be bat0 or coretemp, while a path string
// could be platform_applesmc.768
2016-10-28 21:25:44 +02:00
// preference 1: construct a name based on device name, always unique
2016-10-06 17:33:24 +02:00
2019-02-05 16:37:27 +01:00
devicePath , devErr := filepath . EvalSymlinks ( filepath . Join ( dir , "device" ) )
2016-10-06 17:33:24 +02:00
if devErr == nil {
2019-02-05 16:37:27 +01:00
devPathPrefix , devName := filepath . Split ( devicePath )
_ , devType := filepath . Split ( strings . TrimRight ( devPathPrefix , "/" ) )
2016-10-06 17:33:24 +02:00
cleanDevName := cleanMetricName ( devName )
cleanDevType := cleanMetricName ( devType )
if cleanDevType != "" && cleanDevName != "" {
return cleanDevType + "_" + cleanDevName , nil
}
if cleanDevName != "" {
return cleanDevName , nil
}
}
2016-10-28 21:25:44 +02:00
// preference 2: is there a name file
2022-07-27 20:59:39 +02:00
sysnameRaw , nameErr := os . ReadFile ( filepath . Join ( dir , "name" ) )
2016-10-28 21:25:44 +02:00
if nameErr == nil && string ( sysnameRaw ) != "" {
cleanName := cleanMetricName ( string ( sysnameRaw ) )
if cleanName != "" {
return cleanName , nil
}
}
2016-10-06 17:33:24 +02:00
// it looks bad, name and device don't provide enough information
// return a hwmon[0-9]* name
realDir , err := filepath . EvalSymlinks ( dir )
if err != nil {
return "" , err
}
// take the last path element, this will be hwmonX
2019-02-05 16:37:27 +01:00
_ , name := filepath . Split ( realDir )
2016-10-06 17:33:24 +02:00
cleanName := cleanMetricName ( name )
if cleanName != "" {
return cleanName , nil
}
return "" , errors . New ( "Could not derive a monitoring name for " + dir )
}
2017-01-03 14:41:05 +01:00
// hwmonHumanReadableChipName is similar to the methods in hwmonName, but with
// different precedences -- we can allow duplicates here.
2016-11-29 11:53:29 +01:00
func ( c * hwMonCollector ) hwmonHumanReadableChipName ( dir string ) ( string , error ) {
2022-07-27 20:59:39 +02:00
sysnameRaw , nameErr := os . ReadFile ( filepath . Join ( dir , "name" ) )
2016-11-29 11:53:29 +01:00
if nameErr != nil {
return "" , nameErr
}
if string ( sysnameRaw ) != "" {
cleanName := cleanMetricName ( string ( sysnameRaw ) )
if cleanName != "" {
return cleanName , nil
}
}
return "" , errors . New ( "Could not derive a human-readable chip type for " + dir )
}
2017-02-28 19:47:20 +01:00
func ( c * hwMonCollector ) Update ( ch chan <- prometheus . Metric ) error {
2016-10-06 17:33:24 +02:00
// Step 1: scan /sys/class/hwmon, resolve all symlinks and call
// updatesHwmon for each folder
2019-02-05 16:37:27 +01:00
hwmonPathName := filepath . Join ( sysFilePath ( "class" ) , "hwmon" )
2016-10-06 17:33:24 +02:00
2022-07-27 20:59:39 +02:00
hwmonFiles , err := os . ReadDir ( hwmonPathName )
2016-10-06 17:33:24 +02:00
if err != nil {
2020-06-15 22:27:14 +02:00
if errors . Is ( err , os . ErrNotExist ) {
2024-09-11 10:51:28 +02:00
c . logger . Debug ( "hwmon collector metrics are not available for this system" )
2020-02-19 16:11:29 +01:00
return ErrNoData
2017-01-17 17:24:28 +01:00
}
2016-10-06 17:33:24 +02:00
return err
}
2024-02-07 15:06:24 +01:00
var lastErr error
2016-10-06 17:33:24 +02:00
for _ , hwDir := range hwmonFiles {
2019-02-05 16:37:27 +01:00
hwmonXPathName := filepath . Join ( hwmonPathName , hwDir . Name ( ) )
2024-02-03 10:13:12 +01:00
fileInfo , err := os . Lstat ( hwmonXPathName )
if err != nil {
continue
}
2016-10-06 17:33:24 +02:00
2022-07-27 20:59:39 +02:00
if fileInfo . Mode ( ) & os . ModeSymlink > 0 {
fileInfo , err = os . Stat ( hwmonXPathName )
2016-10-06 17:33:24 +02:00
if err != nil {
continue
}
}
2022-07-27 20:59:39 +02:00
if ! fileInfo . IsDir ( ) {
2016-10-06 17:33:24 +02:00
continue
}
2024-02-07 15:06:24 +01:00
if err = c . updateHwmon ( ch , hwmonXPathName ) ; err != nil {
lastErr = err
2016-10-06 17:33:24 +02:00
}
}
2024-02-07 15:06:24 +01:00
return lastErr
2016-10-06 17:33:24 +02:00
}