2019-09-23 18:18:35 +02:00
// Copyright 2017-2019 The Prometheus Authors
2017-02-07 17:46:51 +01:00
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
2023-08-03 15:29:03 +02:00
//go:build !noinfiniband
// +build !noinfiniband
2017-02-07 17:46:51 +01:00
package collector
import (
2020-06-15 22:27:14 +02:00
"errors"
2019-09-23 18:18:35 +02:00
"fmt"
2024-09-11 10:51:28 +02:00
"log/slog"
2020-02-08 17:18:17 +01:00
"os"
2019-09-23 18:18:35 +02:00
"strconv"
2017-02-07 17:46:51 +01:00
"github.com/prometheus/client_golang/prometheus"
2019-09-23 18:18:35 +02:00
"github.com/prometheus/procfs/sysfs"
2017-02-07 17:46:51 +01:00
)
type infinibandCollector struct {
2019-09-23 18:18:35 +02:00
fs sysfs . FS
metricDescs map [ string ] * prometheus . Desc
2024-09-11 10:51:28 +02:00
logger * slog . Logger
2020-02-19 15:18:44 +01:00
subsystem string
2017-02-07 17:46:51 +01:00
}
func init ( ) {
2017-09-28 15:06:26 +02:00
registerCollector ( "infiniband" , defaultEnabled , NewInfiniBandCollector )
2017-02-07 17:46:51 +01:00
}
2017-02-28 17:44:53 +01:00
// NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
2024-09-11 10:51:28 +02:00
func NewInfiniBandCollector ( logger * slog . Logger ) ( Collector , error ) {
2017-02-07 17:46:51 +01:00
var i infinibandCollector
2019-09-23 18:18:35 +02:00
var err error
2017-02-07 17:46:51 +01:00
2019-09-23 18:18:35 +02:00
i . fs , err = sysfs . NewFS ( * sysPath )
if err != nil {
2019-11-29 14:51:31 +01:00
return nil , fmt . Errorf ( "failed to open sysfs: %w" , err )
2017-02-07 17:46:51 +01:00
}
2019-12-31 17:19:37 +01:00
i . logger = logger
2017-02-07 17:46:51 +01:00
2019-09-23 18:18:35 +02:00
// Detailed description for all metrics.
descriptions := map [ string ] string {
"legacy_multicast_packets_received_total" : "Number of multicast packets received" ,
"legacy_multicast_packets_transmitted_total" : "Number of multicast packets transmitted" ,
"legacy_data_received_bytes_total" : "Number of data octets received on all links" ,
"legacy_packets_received_total" : "Number of data packets received on all links" ,
"legacy_unicast_packets_received_total" : "Number of unicast packets received" ,
"legacy_unicast_packets_transmitted_total" : "Number of unicast packets transmitted" ,
"legacy_data_transmitted_bytes_total" : "Number of data octets transmitted on all links" ,
"legacy_packets_transmitted_total" : "Number of data packets received on all links" ,
2021-04-05 18:00:48 +02:00
"excessive_buffer_overrun_errors_total" : "Number of times that OverrunErrors consecutive flow control update periods occurred, each having at least one overrun error." ,
2019-09-23 18:18:35 +02:00
"link_downed_total" : "Number of times the link failed to recover from an error state and went down" ,
"link_error_recovery_total" : "Number of times the link successfully recovered from an error state" ,
2021-04-05 18:00:48 +02:00
"local_link_integrity_errors_total" : "Number of times that the count of local physical errors exceeded the threshold specified by LocalPhyErrors." ,
2019-09-23 18:18:35 +02:00
"multicast_packets_received_total" : "Number of multicast packets received (including errors)" ,
"multicast_packets_transmitted_total" : "Number of multicast packets transmitted (including errors)" ,
2019-11-22 22:52:17 +01:00
"physical_state_id" : "Physical state of the InfiniBand port (0: no change, 1: sleep, 2: polling, 3: disable, 4: shift, 5: link up, 6: link error recover, 7: phytest)" ,
2019-09-23 18:18:35 +02:00
"port_constraint_errors_received_total" : "Number of packets received on the switch physical port that are discarded" ,
"port_constraint_errors_transmitted_total" : "Number of packets not transmitted from the switch physical port" ,
"port_data_received_bytes_total" : "Number of data octets received on all links" ,
"port_data_transmitted_bytes_total" : "Number of data octets transmitted on all links" ,
"port_discards_received_total" : "Number of inbound packets discarded by the port because the port is down or congested" ,
"port_discards_transmitted_total" : "Number of outbound packets discarded by the port because the port is down or congested" ,
"port_errors_received_total" : "Number of packets containing an error that were received on this port" ,
"port_packets_received_total" : "Number of packets received on all VLs by this port (including errors)" ,
"port_packets_transmitted_total" : "Number of packets transmitted on all VLs from this port (including errors)" ,
"port_transmit_wait_total" : "Number of ticks during which the port had data to transmit but no data was sent during the entire tick" ,
2019-11-22 22:52:17 +01:00
"rate_bytes_per_second" : "Maximum signal transfer rate" ,
"state_id" : "State of the InfiniBand port (0: no change, 1: down, 2: init, 3: armed, 4: active, 5: act defer)" ,
2019-09-23 18:18:35 +02:00
"unicast_packets_received_total" : "Number of unicast packets received (including errors)" ,
"unicast_packets_transmitted_total" : "Number of unicast packets transmitted (including errors)" ,
2020-04-30 17:35:07 +02:00
"port_receive_remote_physical_errors_total" : "Number of packets marked with the EBP (End of Bad Packet) delimiter received on the port." ,
"port_receive_switch_relay_errors_total" : "Number of packets that could not be forwarded by the switch." ,
"symbol_error_total" : "Number of minor link errors detected on one or more physical lanes." ,
"vl15_dropped_total" : "Number of incoming VL15 packets dropped due to resource limitations." ,
2017-03-09 23:10:36 +01:00
}
2017-02-07 17:46:51 +01:00
i . metricDescs = make ( map [ string ] * prometheus . Desc )
2020-02-19 15:18:44 +01:00
i . subsystem = "infiniband"
2017-02-07 17:46:51 +01:00
2019-09-23 18:18:35 +02:00
for metricName , description := range descriptions {
2017-03-09 23:10:36 +01:00
i . metricDescs [ metricName ] = prometheus . NewDesc (
2020-02-19 15:18:44 +01:00
prometheus . BuildFQName ( namespace , i . subsystem , metricName ) ,
2019-09-23 18:18:35 +02:00
description ,
2017-03-09 23:10:36 +01:00
[ ] string { "device" , "port" } ,
nil ,
)
}
2017-02-07 17:46:51 +01:00
return & i , nil
}
2019-09-23 18:18:35 +02:00
func ( c * infinibandCollector ) pushMetric ( ch chan <- prometheus . Metric , name string , value uint64 , deviceName string , port string , valueType prometheus . ValueType ) {
ch <- prometheus . MustNewConstMetric ( c . metricDescs [ name ] , valueType , float64 ( value ) , deviceName , port )
2017-02-07 17:46:51 +01:00
}
2019-09-23 18:18:35 +02:00
func ( c * infinibandCollector ) pushCounter ( ch chan <- prometheus . Metric , name string , value * uint64 , deviceName string , port string ) {
if value != nil {
c . pushMetric ( ch , name , * value , deviceName , port , prometheus . CounterValue )
2017-05-12 07:28:53 +02:00
}
2017-02-07 17:46:51 +01:00
}
2017-02-28 19:47:20 +01:00
func ( c * infinibandCollector ) Update ( ch chan <- prometheus . Metric ) error {
2019-09-23 18:18:35 +02:00
devices , err := c . fs . InfiniBandClass ( )
if err != nil {
2020-06-15 22:27:14 +02:00
if errors . Is ( err , os . ErrNotExist ) {
2024-09-11 10:51:28 +02:00
c . logger . Debug ( "infiniband statistics not found, skipping" )
2020-02-19 16:11:29 +01:00
return ErrNoData
2020-02-08 17:18:17 +01:00
}
2020-06-15 22:27:14 +02:00
return fmt . Errorf ( "error obtaining InfiniBand class info: %w" , err )
2017-02-07 17:46:51 +01:00
}
for _ , device := range devices {
2020-02-19 15:18:44 +01:00
infoDesc := prometheus . NewDesc (
prometheus . BuildFQName ( namespace , c . subsystem , "info" ) ,
"Non-numeric data from /sys/class/infiniband/<device>, value is always 1." ,
[ ] string { "device" , "board_id" , "firmware_version" , "hca_type" } ,
nil ,
)
infoValue := 1.0
ch <- prometheus . MustNewConstMetric ( infoDesc , prometheus . GaugeValue , infoValue , device . Name , device . BoardID , device . FirmwareVersion , device . HCAType )
2019-09-23 18:18:35 +02:00
for _ , port := range device . Ports {
portStr := strconv . FormatUint ( uint64 ( port . Port ) , 10 )
2019-11-22 22:52:17 +01:00
c . pushMetric ( ch , "state_id" , uint64 ( port . StateID ) , port . Name , portStr , prometheus . GaugeValue )
c . pushMetric ( ch , "physical_state_id" , uint64 ( port . PhysStateID ) , port . Name , portStr , prometheus . GaugeValue )
c . pushMetric ( ch , "rate_bytes_per_second" , port . Rate , port . Name , portStr , prometheus . GaugeValue )
2019-09-23 18:18:35 +02:00
c . pushCounter ( ch , "legacy_multicast_packets_received_total" , port . Counters . LegacyPortMulticastRcvPackets , port . Name , portStr )
c . pushCounter ( ch , "legacy_multicast_packets_transmitted_total" , port . Counters . LegacyPortMulticastXmitPackets , port . Name , portStr )
c . pushCounter ( ch , "legacy_data_received_bytes_total" , port . Counters . LegacyPortRcvData64 , port . Name , portStr )
c . pushCounter ( ch , "legacy_packets_received_total" , port . Counters . LegacyPortRcvPackets64 , port . Name , portStr )
c . pushCounter ( ch , "legacy_unicast_packets_received_total" , port . Counters . LegacyPortUnicastRcvPackets , port . Name , portStr )
c . pushCounter ( ch , "legacy_unicast_packets_transmitted_total" , port . Counters . LegacyPortUnicastXmitPackets , port . Name , portStr )
c . pushCounter ( ch , "legacy_data_transmitted_bytes_total" , port . Counters . LegacyPortXmitData64 , port . Name , portStr )
c . pushCounter ( ch , "legacy_packets_transmitted_total" , port . Counters . LegacyPortXmitPackets64 , port . Name , portStr )
2021-04-05 18:00:48 +02:00
c . pushCounter ( ch , "excessive_buffer_overrun_errors_total" , port . Counters . ExcessiveBufferOverrunErrors , port . Name , portStr )
2019-09-23 18:18:35 +02:00
c . pushCounter ( ch , "link_downed_total" , port . Counters . LinkDowned , port . Name , portStr )
c . pushCounter ( ch , "link_error_recovery_total" , port . Counters . LinkErrorRecovery , port . Name , portStr )
2021-04-05 18:00:48 +02:00
c . pushCounter ( ch , "local_link_integrity_errors_total" , port . Counters . LocalLinkIntegrityErrors , port . Name , portStr )
2019-09-23 18:18:35 +02:00
c . pushCounter ( ch , "multicast_packets_received_total" , port . Counters . MulticastRcvPackets , port . Name , portStr )
c . pushCounter ( ch , "multicast_packets_transmitted_total" , port . Counters . MulticastXmitPackets , port . Name , portStr )
c . pushCounter ( ch , "port_constraint_errors_received_total" , port . Counters . PortRcvConstraintErrors , port . Name , portStr )
c . pushCounter ( ch , "port_constraint_errors_transmitted_total" , port . Counters . PortXmitConstraintErrors , port . Name , portStr )
c . pushCounter ( ch , "port_data_received_bytes_total" , port . Counters . PortRcvData , port . Name , portStr )
c . pushCounter ( ch , "port_data_transmitted_bytes_total" , port . Counters . PortXmitData , port . Name , portStr )
c . pushCounter ( ch , "port_discards_received_total" , port . Counters . PortRcvDiscards , port . Name , portStr )
c . pushCounter ( ch , "port_discards_transmitted_total" , port . Counters . PortXmitDiscards , port . Name , portStr )
c . pushCounter ( ch , "port_errors_received_total" , port . Counters . PortRcvErrors , port . Name , portStr )
c . pushCounter ( ch , "port_packets_received_total" , port . Counters . PortRcvPackets , port . Name , portStr )
c . pushCounter ( ch , "port_packets_transmitted_total" , port . Counters . PortXmitPackets , port . Name , portStr )
c . pushCounter ( ch , "port_transmit_wait_total" , port . Counters . PortXmitWait , port . Name , portStr )
c . pushCounter ( ch , "unicast_packets_received_total" , port . Counters . UnicastRcvPackets , port . Name , portStr )
c . pushCounter ( ch , "unicast_packets_transmitted_total" , port . Counters . UnicastXmitPackets , port . Name , portStr )
2020-04-30 17:35:07 +02:00
c . pushCounter ( ch , "port_receive_remote_physical_errors_total" , port . Counters . PortRcvRemotePhysicalErrors , port . Name , portStr )
c . pushCounter ( ch , "port_receive_switch_relay_errors_total" , port . Counters . PortRcvSwitchRelayErrors , port . Name , portStr )
c . pushCounter ( ch , "symbol_error_total" , port . Counters . SymbolError , port . Name , portStr )
c . pushCounter ( ch , "vl15_dropped_total" , port . Counters . VL15Dropped , port . Name , portStr )
2017-02-07 17:46:51 +01:00
}
}
return nil
}