2017-02-07 17:46:51 +01:00
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build linux
// +build !noinfiniband
package collector
import (
"errors"
2017-03-09 18:05:36 +01:00
"os"
2017-02-07 17:46:51 +01:00
"path/filepath"
2018-10-04 15:05:59 +02:00
"strings"
2017-02-07 17:46:51 +01:00
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/common/log"
)
// infinibandPath is the location of the InfiniBand class directory, relative
// to the sysfs root it is joined with via sysFilePath.
const infinibandPath = "class/infiniband"
// Sentinel errors used to tell "nothing to collect" apart from real failures.
var (
	// errInfinibandNoDevicesFound is returned when the InfiniBand class
	// directory contains no device entries.
	errInfinibandNoDevicesFound = errors.New("no InfiniBand devices detected")

	// errInfinibandNoPortsFound is returned when a device has no entries in
	// its ports directory.
	errInfinibandNoPortsFound = errors.New("no InfiniBand ports detected")
)
type infinibandCollector struct {
2017-03-09 23:10:36 +01:00
metricDescs map [ string ] * prometheus . Desc
counters map [ string ] infinibandMetric
legacyCounters map [ string ] infinibandMetric
2017-02-07 17:46:51 +01:00
}
// infinibandMetric describes a single counter file. The field order is
// significant: the metric tables are written as unkeyed {File, Help} literals.
type infinibandMetric struct {
	// File is the name of the counter file inside a port's counter directory.
	File string
	// Help is the help text attached to the exported metric.
	Help string
}
func init ( ) {
2017-09-28 15:06:26 +02:00
registerCollector ( "infiniband" , defaultEnabled , NewInfiniBandCollector )
2017-02-07 17:46:51 +01:00
}
2017-02-28 17:44:53 +01:00
// NewInfiniBandCollector returns a new Collector exposing InfiniBand stats.
2017-02-07 17:46:51 +01:00
func NewInfiniBandCollector ( ) ( Collector , error ) {
var i infinibandCollector
// Filenames of all InfiniBand counter metrics including a detailed description.
i . counters = map [ string ] infinibandMetric {
2018-10-30 21:54:09 +01:00
"link_downed_total" : { "link_downed" , "Number of times the link failed to recover from an error state and went down" } ,
"link_error_recovery_total" : { "link_error_recovery" , "Number of times the link successfully recovered from an error state" } ,
"multicast_packets_received_total" : { "multicast_rcv_packets" , "Number of multicast packets received (including errors)" } ,
"multicast_packets_transmitted_total" : { "multicast_xmit_packets" , "Number of multicast packets transmitted (including errors)" } ,
"port_constraint_errors_received_total" : { "port_rcv_constraint_errors" , "Number of packets received on the switch physical port that are discarded" } ,
"port_constraint_errors_transmitted_total" : { "port_xmit_constraint_errors" , "Number of packets not transmitted from the switch physical port" } ,
"port_data_received_bytes_total" : { "port_rcv_data" , "Number of data octets received on all links" } ,
"port_data_transmitted_bytes_total" : { "port_xmit_data" , "Number of data octets transmitted on all links" } ,
"port_discards_received_total" : { "port_rcv_discards" , "Number of inbound packets discarded by the port because the port is down or congested" } ,
"port_discards_transmitted_total" : { "port_xmit_discards" , "Number of outbound packets discarded by the port because the port is down or congested" } ,
"port_errors_received_total" : { "port_rcv_errors" , "Number of packets containing an error that were received on this port" } ,
"port_packets_received_total" : { "port_rcv_packets" , "Number of packets received on all VLs by this port (including errors)" } ,
"port_packets_transmitted_total" : { "port_xmit_packets" , "Number of packets transmitted on all VLs from this port (including errors)" } ,
"port_transmit_wait_total" : { "port_xmit_wait" , "Number of ticks during which the port had data to transmit but no data was sent during the entire tick" } ,
"unicast_packets_received_total" : { "unicast_rcv_packets" , "Number of unicast packets received (including errors)" } ,
"unicast_packets_transmitted_total" : { "unicast_xmit_packets" , "Number of unicast packets transmitted (including errors)" } ,
2017-02-07 17:46:51 +01:00
}
2017-03-09 23:10:36 +01:00
// Deprecated counters for some older versions of InfiniBand drivers.
i . legacyCounters = map [ string ] infinibandMetric {
"legacy_multicast_packets_received_total" : { "port_multicast_rcv_packets" , "Number of multicast packets received" } ,
"legacy_multicast_packets_transmitted_total" : { "port_multicast_xmit_packets" , "Number of multicast packets transmitted" } ,
"legacy_data_received_bytes_total" : { "port_rcv_data_64" , "Number of data octets received on all links" } ,
"legacy_packets_received_total" : { "port_rcv_packets_64" , "Number of data packets received on all links" } ,
"legacy_unicast_packets_received_total" : { "port_unicast_rcv_packets" , "Number of unicast packets received" } ,
"legacy_unicast_packets_transmitted_total" : { "port_unicast_xmit_packets" , "Number of unicast packets transmitted" } ,
"legacy_data_transmitted_bytes_total" : { "port_xmit_data_64" , "Number of data octets transmitted on all links" } ,
"legacy_packets_transmitted_total" : { "port_xmit_packets_64" , "Number of data packets received on all links" } ,
}
2017-02-07 17:46:51 +01:00
subsystem := "infiniband"
i . metricDescs = make ( map [ string ] * prometheus . Desc )
for metricName , infinibandMetric := range i . counters {
i . metricDescs [ metricName ] = prometheus . NewDesc (
2017-09-28 15:06:26 +02:00
prometheus . BuildFQName ( namespace , subsystem , metricName ) ,
2017-02-07 17:46:51 +01:00
infinibandMetric . Help ,
[ ] string { "device" , "port" } ,
nil ,
)
}
2017-03-09 23:10:36 +01:00
for metricName , infinibandMetric := range i . legacyCounters {
i . metricDescs [ metricName ] = prometheus . NewDesc (
2017-09-28 15:06:26 +02:00
prometheus . BuildFQName ( namespace , subsystem , metricName ) ,
2017-03-09 23:10:36 +01:00
infinibandMetric . Help ,
[ ] string { "device" , "port" } ,
nil ,
)
}
2017-02-07 17:46:51 +01:00
return & i , nil
}
// infinibandDevices returns the names of the entries found directly below
// infinibandPath, which are the InfiniBand device names.
//
// It returns errInfinibandNoDevicesFound when the glob matches nothing, and
// passes through any error reported by filepath.Glob.
func infinibandDevices(infinibandPath string) ([]string, error) {
	matches, err := filepath.Glob(filepath.Join(infinibandPath, "/*"))
	if err != nil {
		return nil, err
	}

	if len(matches) == 0 {
		log.Debugf("Unable to detect InfiniBand devices")
		return nil, errInfinibandNoDevicesFound
	}

	// Reduce every match to its last path element, i.e. the device name.
	for idx := range matches {
		matches[idx] = filepath.Base(matches[idx])
	}
	return matches, nil
}
// infinibandPorts returns the names of the entries in the "ports" directory
// of the given device below infinibandPath, which are the port numbers.
//
// It returns errInfinibandNoPortsFound when the glob matches nothing, and
// passes through any error reported by filepath.Glob.
func infinibandPorts(infinibandPath, device string) ([]string, error) {
	matches, err := filepath.Glob(filepath.Join(infinibandPath, device, "ports/*"))
	if err != nil {
		return nil, err
	}

	if len(matches) == 0 {
		log.Debugf("Unable to detect ports for %s", device)
		return nil, errInfinibandNoPortsFound
	}

	// Reduce every match to its last path element, i.e. the port number.
	for idx := range matches {
		matches[idx] = filepath.Base(matches[idx])
	}
	return matches, nil
}
func readMetric ( directory , metricFile string ) ( uint64 , error ) {
metric , err := readUintFromFile ( filepath . Join ( directory , metricFile ) )
if err != nil {
2018-10-04 15:05:59 +02:00
// Ugly workaround for handling #966, when counters are
// `N/A (not available)`.
// This was already patched and submitted, see
// https://www.spinics.net/lists/linux-rdma/msg68596.html
// Remove this as soon as the fix lands in the enterprise distros.
if strings . Contains ( err . Error ( ) , "N/A (no PMA)" ) {
log . Debugf ( "%q value is N/A" , metricFile )
return 0 , nil
}
2017-02-07 17:46:51 +01:00
log . Debugf ( "Error reading %q file" , metricFile )
return 0 , err
}
2017-05-12 07:28:53 +02:00
// According to Mellanox, the following metrics "are divided by 4 unconditionally"
// as they represent the amount of data being transmitted and received per lane.
// Mellanox cards have 4 lanes per port, so all values must be multiplied by 4
// to get the expected value.
switch metricFile {
case "port_rcv_data" , "port_xmit_data" , "port_rcv_data_64" , "port_xmit_data_64" :
metric *= 4
}
2017-02-07 17:46:51 +01:00
return metric , nil
}
2017-02-28 19:47:20 +01:00
func ( c * infinibandCollector ) Update ( ch chan <- prometheus . Metric ) error {
2017-02-07 17:46:51 +01:00
devices , err := infinibandDevices ( sysFilePath ( infinibandPath ) )
// If no devices are found or another error is raised while attempting to find devices,
// InfiniBand is likely not installed and the collector should be skipped.
switch err {
case nil :
case errInfinibandNoDevicesFound :
return nil
default :
return err
}
for _ , device := range devices {
ports , err := infinibandPorts ( sysFilePath ( infinibandPath ) , device )
// If no ports are found for the specified device, skip to the next device.
switch err {
case nil :
case errInfinibandNoPortsFound :
continue
default :
return err
}
for _ , port := range ports {
portFiles := sysFilePath ( filepath . Join ( infinibandPath , device , "ports" , port ) )
// Add metrics for the InfiniBand counters.
for metricName , infinibandMetric := range c . counters {
2017-03-09 18:05:36 +01:00
if _ , err := os . Stat ( filepath . Join ( portFiles , "counters" , infinibandMetric . File ) ) ; os . IsNotExist ( err ) {
continue
}
2017-02-07 17:46:51 +01:00
metric , err := readMetric ( filepath . Join ( portFiles , "counters" ) , infinibandMetric . File )
if err != nil {
return err
}
ch <- prometheus . MustNewConstMetric (
c . metricDescs [ metricName ] ,
prometheus . CounterValue ,
float64 ( metric ) ,
device ,
port ,
)
}
2017-03-09 23:10:36 +01:00
// Add metrics for the legacy InfiniBand counters.
for metricName , infinibandMetric := range c . legacyCounters {
if _ , err := os . Stat ( filepath . Join ( portFiles , "counters_ext" , infinibandMetric . File ) ) ; os . IsNotExist ( err ) {
continue
}
metric , err := readMetric ( filepath . Join ( portFiles , "counters_ext" ) , infinibandMetric . File )
if err != nil {
return err
}
ch <- prometheus . MustNewConstMetric (
c . metricDescs [ metricName ] ,
prometheus . CounterValue ,
float64 ( metric ) ,
device ,
port ,
)
}
2017-02-07 17:46:51 +01:00
}
}
return nil
}