2017-06-13 11:21:53 +02:00
// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !nocpu
package collector
import (
"fmt"
"path/filepath"
2020-07-17 18:32:23 +02:00
"regexp"
2018-02-27 19:43:15 +01:00
"strconv"
2020-05-23 21:46:54 +02:00
"sync"
2017-06-13 11:21:53 +02:00
2020-11-14 11:53:51 +01:00
"github.com/go-kit/log"
"github.com/go-kit/log/level"
2017-06-13 11:21:53 +02:00
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
2019-12-31 17:19:37 +01:00
"gopkg.in/alecthomas/kingpin.v2"
2017-06-13 11:21:53 +02:00
)
type cpuCollector struct {
2019-04-10 18:16:12 +02:00
fs procfs . FS
2017-06-13 11:21:53 +02:00
cpu * prometheus . Desc
2019-09-11 23:06:36 +02:00
cpuInfo * prometheus . Desc
2020-07-17 18:32:23 +02:00
cpuFlagsInfo * prometheus . Desc
cpuBugsInfo * prometheus . Desc
2017-11-23 15:04:47 +01:00
cpuGuest * prometheus . Desc
2017-06-13 11:21:53 +02:00
cpuCoreThrottle * prometheus . Desc
cpuPackageThrottle * prometheus . Desc
2019-12-31 17:19:37 +01:00
logger log . Logger
2020-05-23 21:46:54 +02:00
cpuStats [ ] procfs . CPUStat
cpuStatsMutex sync . Mutex
2020-07-17 18:32:23 +02:00
cpuFlagsIncludeRegexp * regexp . Regexp
cpuBugsIncludeRegexp * regexp . Regexp
2017-06-13 11:21:53 +02:00
}
2021-07-04 10:47:04 +02:00
// jumpBackSeconds is the idle jump-back limit, in seconds: when a CPU's idle
// counter drops by at least this much between two reads, the cached stats for
// that CPU are reset.
const jumpBackSeconds = 3.0
2019-09-11 23:06:36 +02:00
var (
2021-07-04 10:47:04 +02:00
enableCPUInfo = kingpin . Flag ( "collector.cpu.info" , "Enables metric cpu_info" ) . Bool ( )
flagsInclude = kingpin . Flag ( "collector.cpu.info.flags-include" , "Filter the `flags` field in cpuInfo with a value that must be a regular expression" ) . String ( )
bugsInclude = kingpin . Flag ( "collector.cpu.info.bugs-include" , "Filter the `bugs` field in cpuInfo with a value that must be a regular expression" ) . String ( )
jumpBackDebugMessage = fmt . Sprintf ( "CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats" , jumpBackSeconds )
2019-09-11 23:06:36 +02:00
)
2017-06-13 11:21:53 +02:00
func init ( ) {
2017-09-28 15:06:26 +02:00
registerCollector ( "cpu" , defaultEnabled , NewCPUCollector )
2017-06-13 11:21:53 +02:00
}
// NewCPUCollector returns a new Collector exposing kernel/system statistics.
2019-12-31 17:19:37 +01:00
func NewCPUCollector ( logger log . Logger ) ( Collector , error ) {
2019-04-10 18:16:12 +02:00
fs , err := procfs . NewFS ( * procPath )
if err != nil {
2019-11-29 14:51:31 +01:00
return nil , fmt . Errorf ( "failed to open procfs: %w" , err )
2019-04-10 18:16:12 +02:00
}
2020-07-17 18:32:23 +02:00
c := & cpuCollector {
2019-04-10 18:16:12 +02:00
fs : fs ,
2018-04-29 14:34:47 +02:00
cpu : nodeCPUSecondsDesc ,
2019-09-11 23:06:36 +02:00
cpuInfo : prometheus . NewDesc (
prometheus . BuildFQName ( namespace , cpuCollectorSubsystem , "info" ) ,
"CPU information from /proc/cpuinfo." ,
2020-02-20 17:36:02 +01:00
[ ] string { "package" , "core" , "cpu" , "vendor" , "family" , "model" , "model_name" , "microcode" , "stepping" , "cachesize" } , nil ,
2019-09-11 23:06:36 +02:00
) ,
2020-07-17 18:32:23 +02:00
cpuFlagsInfo : prometheus . NewDesc (
prometheus . BuildFQName ( namespace , cpuCollectorSubsystem , "flag_info" ) ,
"The `flags` field of CPU information from /proc/cpuinfo." ,
[ ] string { "flag" } , nil ,
) ,
cpuBugsInfo : prometheus . NewDesc (
prometheus . BuildFQName ( namespace , cpuCollectorSubsystem , "bug_info" ) ,
"The `bugs` field of CPU information from /proc/cpuinfo." ,
[ ] string { "bug" } , nil ,
) ,
2017-11-23 15:04:47 +01:00
cpuGuest : prometheus . NewDesc (
prometheus . BuildFQName ( namespace , cpuCollectorSubsystem , "guest_seconds_total" ) ,
2020-09-03 19:39:19 +02:00
"Seconds the CPUs spent in guests (VMs) for each mode." ,
2017-11-23 15:04:47 +01:00
[ ] string { "cpu" , "mode" } , nil ,
) ,
2017-06-13 11:21:53 +02:00
cpuCoreThrottle : prometheus . NewDesc (
2017-09-28 15:06:26 +02:00
prometheus . BuildFQName ( namespace , cpuCollectorSubsystem , "core_throttles_total" ) ,
2020-09-03 19:39:19 +02:00
"Number of times this CPU core has been throttled." ,
2018-04-09 18:01:52 +02:00
[ ] string { "package" , "core" } , nil ,
2017-06-13 11:21:53 +02:00
) ,
cpuPackageThrottle : prometheus . NewDesc (
2017-09-28 15:06:26 +02:00
prometheus . BuildFQName ( namespace , cpuCollectorSubsystem , "package_throttles_total" ) ,
2020-09-03 19:39:19 +02:00
"Number of times this CPU package has been throttled." ,
2018-04-09 18:01:52 +02:00
[ ] string { "package" } , nil ,
2017-06-13 11:21:53 +02:00
) ,
2019-12-31 17:19:37 +01:00
logger : logger ,
2020-07-17 18:32:23 +02:00
}
err = c . compileIncludeFlags ( flagsInclude , bugsInclude )
if err != nil {
return nil , fmt . Errorf ( "fail to compile --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include, the values of them must be regular expressions: %w" , err )
}
return c , nil
}
func ( c * cpuCollector ) compileIncludeFlags ( flagsIncludeFlag , bugsIncludeFlag * string ) error {
if ( * flagsIncludeFlag != "" || * bugsIncludeFlag != "" ) && ! * enableCPUInfo {
* enableCPUInfo = true
level . Info ( c . logger ) . Log ( "msg" , "--collector.cpu.info has been set to `true` because you set the following flags, like --collector.cpu.info.flags-include and --collector.cpu.info.bugs-include" )
}
var err error
if * flagsIncludeFlag != "" {
c . cpuFlagsIncludeRegexp , err = regexp . Compile ( * flagsIncludeFlag )
if err != nil {
return err
}
}
if * bugsIncludeFlag != "" {
c . cpuBugsIncludeRegexp , err = regexp . Compile ( * bugsIncludeFlag )
if err != nil {
return err
}
}
return nil
2017-06-13 11:21:53 +02:00
}
// Update implements Collector and exposes cpu related metrics from /proc/stat and /sys/.../cpu/.
func ( c * cpuCollector ) Update ( ch chan <- prometheus . Metric ) error {
2019-09-11 23:06:36 +02:00
if * enableCPUInfo {
if err := c . updateInfo ( ch ) ; err != nil {
return err
}
}
2017-06-13 11:21:53 +02:00
if err := c . updateStat ( ch ) ; err != nil {
return err
}
2018-10-18 17:28:19 +02:00
if err := c . updateThermalThrottle ( ch ) ; err != nil {
return err
}
2017-06-13 11:21:53 +02:00
return nil
}
2019-09-11 23:06:36 +02:00
// updateInfo reads /proc/cpuinfo
func ( c * cpuCollector ) updateInfo ( ch chan <- prometheus . Metric ) error {
info , err := c . fs . CPUInfo ( )
if err != nil {
return err
}
for _ , cpu := range info {
ch <- prometheus . MustNewConstMetric ( c . cpuInfo ,
prometheus . GaugeValue ,
1 ,
cpu . PhysicalID ,
cpu . CoreID ,
2020-02-19 14:34:05 +01:00
strconv . Itoa ( int ( cpu . Processor ) ) ,
2019-09-11 23:06:36 +02:00
cpu . VendorID ,
cpu . CPUFamily ,
cpu . Model ,
2020-02-20 17:36:02 +01:00
cpu . ModelName ,
2019-09-11 23:06:36 +02:00
cpu . Microcode ,
2020-02-20 17:36:02 +01:00
cpu . Stepping ,
2019-09-11 23:06:36 +02:00
cpu . CacheSize )
2020-07-17 18:32:23 +02:00
if err := updateFieldInfo ( cpu . Flags , c . cpuFlagsIncludeRegexp , c . cpuFlagsInfo , ch ) ; err != nil {
return err
}
if err := updateFieldInfo ( cpu . Bugs , c . cpuBugsIncludeRegexp , c . cpuBugsInfo , ch ) ; err != nil {
return err
}
}
return nil
}
func updateFieldInfo ( valueList [ ] string , filter * regexp . Regexp , desc * prometheus . Desc , ch chan <- prometheus . Metric ) error {
if filter == nil {
return nil
}
for _ , val := range valueList {
if ! filter . MatchString ( val ) {
continue
}
ch <- prometheus . MustNewConstMetric ( desc ,
prometheus . GaugeValue ,
1 ,
val ,
)
2019-09-11 23:06:36 +02:00
}
return nil
}
2018-10-18 17:28:19 +02:00
// updateThermalThrottle reads /sys/devices/system/cpu/cpu* and expose thermal throttle statistics.
func ( c * cpuCollector ) updateThermalThrottle ( ch chan <- prometheus . Metric ) error {
2018-04-09 18:01:52 +02:00
cpus , err := filepath . Glob ( sysFilePath ( "devices/system/cpu/cpu[0-9]*" ) )
2017-06-13 11:21:53 +02:00
if err != nil {
return err
}
2018-04-09 18:01:52 +02:00
packageThrottles := make ( map [ uint64 ] uint64 )
packageCoreThrottles := make ( map [ uint64 ] map [ uint64 ] uint64 )
2018-02-27 19:43:15 +01:00
2017-09-07 23:24:18 +02:00
// cpu loop
2017-06-13 11:21:53 +02:00
for _ , cpu := range cpus {
2018-04-09 18:01:52 +02:00
// See
// https://www.kernel.org/doc/Documentation/x86/topology.txt
// https://www.kernel.org/doc/Documentation/cputopology.txt
// https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-system-cpu
var err error
var physicalPackageID , coreID uint64
// topology/physical_package_id
if physicalPackageID , err = readUintFromFile ( filepath . Join ( cpu , "topology" , "physical_package_id" ) ) ; err != nil {
2019-12-31 17:19:37 +01:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU is missing physical_package_id" , "cpu" , cpu )
2018-04-09 18:01:52 +02:00
continue
}
// topology/core_id
if coreID , err = readUintFromFile ( filepath . Join ( cpu , "topology" , "core_id" ) ) ; err != nil {
2019-12-31 17:19:37 +01:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU is missing core_id" , "cpu" , cpu )
2017-09-07 23:24:18 +02:00
continue
}
2018-02-27 19:43:15 +01:00
2018-04-09 18:01:52 +02:00
// metric node_cpu_core_throttles_total
//
// We process this metric before the package throttles as there
2020-09-03 19:39:19 +02:00
// are CPU+kernel combinations that only present core throttles
2018-04-09 18:01:52 +02:00
// but no package throttles.
// Seen e.g. on an Intel Xeon E5472 system with RHEL 6.9 kernel.
if _ , present := packageCoreThrottles [ physicalPackageID ] ; ! present {
packageCoreThrottles [ physicalPackageID ] = make ( map [ uint64 ] uint64 )
}
if _ , present := packageCoreThrottles [ physicalPackageID ] [ coreID ] ; ! present {
// Read thermal_throttle/core_throttle_count only once
if coreThrottleCount , err := readUintFromFile ( filepath . Join ( cpu , "thermal_throttle" , "core_throttle_count" ) ) ; err == nil {
packageCoreThrottles [ physicalPackageID ] [ coreID ] = coreThrottleCount
} else {
2019-12-31 17:19:37 +01:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU is missing core_throttle_count" , "cpu" , cpu )
2018-02-27 19:43:15 +01:00
}
2017-09-07 23:24:18 +02:00
}
2018-02-27 19:43:15 +01:00
2018-04-09 18:01:52 +02:00
// metric node_cpu_package_throttles_total
if _ , present := packageThrottles [ physicalPackageID ] ; ! present {
// Read thermal_throttle/package_throttle_count only once
if packageThrottleCount , err := readUintFromFile ( filepath . Join ( cpu , "thermal_throttle" , "package_throttle_count" ) ) ; err == nil {
packageThrottles [ physicalPackageID ] = packageThrottleCount
} else {
2019-12-31 17:19:37 +01:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU is missing package_throttle_count" , "cpu" , cpu )
2018-04-09 18:01:52 +02:00
}
}
2017-09-07 23:24:18 +02:00
}
2017-06-20 07:51:26 +02:00
2018-04-09 18:01:52 +02:00
for physicalPackageID , packageThrottleCount := range packageThrottles {
ch <- prometheus . MustNewConstMetric ( c . cpuPackageThrottle ,
prometheus . CounterValue ,
float64 ( packageThrottleCount ) ,
strconv . FormatUint ( physicalPackageID , 10 ) )
2017-09-07 23:24:18 +02:00
}
2018-04-29 14:34:47 +02:00
for physicalPackageID , coreMap := range packageCoreThrottles {
for coreID , coreThrottleCount := range coreMap {
2018-04-09 18:01:52 +02:00
ch <- prometheus . MustNewConstMetric ( c . cpuCoreThrottle ,
prometheus . CounterValue ,
float64 ( coreThrottleCount ) ,
strconv . FormatUint ( physicalPackageID , 10 ) ,
strconv . FormatUint ( coreID , 10 ) )
2017-09-07 23:24:18 +02:00
}
2017-06-13 11:21:53 +02:00
}
return nil
}
2020-09-03 19:39:19 +02:00
// updateStat reads /proc/stat through procfs and exports CPU-related metrics.
2017-06-13 11:21:53 +02:00
func ( c * cpuCollector ) updateStat ( ch chan <- prometheus . Metric ) error {
2019-06-12 20:47:16 +02:00
stats , err := c . fs . Stat ( )
2017-06-13 11:21:53 +02:00
if err != nil {
return err
}
2020-05-23 21:46:54 +02:00
c . updateCPUStats ( stats . CPU )
// Acquire a lock to read the stats.
c . cpuStatsMutex . Lock ( )
defer c . cpuStatsMutex . Unlock ( )
for cpuID , cpuStat := range c . cpuStats {
2020-02-19 14:34:05 +01:00
cpuNum := strconv . Itoa ( cpuID )
2018-02-01 18:42:20 +01:00
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . User , cpuNum , "user" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . Nice , cpuNum , "nice" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . System , cpuNum , "system" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . Idle , cpuNum , "idle" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . Iowait , cpuNum , "iowait" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . IRQ , cpuNum , "irq" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . SoftIRQ , cpuNum , "softirq" )
ch <- prometheus . MustNewConstMetric ( c . cpu , prometheus . CounterValue , cpuStat . Steal , cpuNum , "steal" )
2017-11-23 15:04:47 +01:00
// Guest CPU is also accounted for in cpuStat.User and cpuStat.Nice, expose these as separate metrics.
ch <- prometheus . MustNewConstMetric ( c . cpuGuest , prometheus . CounterValue , cpuStat . Guest , cpuNum , "user" )
ch <- prometheus . MustNewConstMetric ( c . cpuGuest , prometheus . CounterValue , cpuStat . GuestNice , cpuNum , "nice" )
2017-06-13 11:21:53 +02:00
}
return nil
}
2020-05-23 21:46:54 +02:00
// updateCPUStats updates the internal cache of CPU stats.
func ( c * cpuCollector ) updateCPUStats ( newStats [ ] procfs . CPUStat ) {
2021-07-04 10:47:04 +02:00
2020-05-23 21:46:54 +02:00
// Acquire a lock to update the stats.
c . cpuStatsMutex . Lock ( )
defer c . cpuStatsMutex . Unlock ( )
// Reset the cache if the list of CPUs has changed.
if len ( c . cpuStats ) != len ( newStats ) {
c . cpuStats = make ( [ ] procfs . CPUStat , len ( newStats ) )
}
for i , n := range newStats {
2021-07-04 10:47:04 +02:00
// If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU.
if ( c . cpuStats [ i ] . Idle - n . Idle ) >= jumpBackSeconds {
level . Debug ( c . logger ) . Log ( "msg" , jumpBackDebugMessage , "cpu" , i , "old_value" , c . cpuStats [ i ] . Idle , "new_value" , n . Idle )
2020-05-23 21:46:54 +02:00
c . cpuStats [ i ] = procfs . CPUStat { }
}
2021-07-04 10:47:04 +02:00
if n . Idle >= c . cpuStats [ i ] . Idle {
c . cpuStats [ i ] . Idle = n . Idle
} else {
level . Debug ( c . logger ) . Log ( "msg" , "CPU Idle counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . Idle , "new_value" , n . Idle )
}
2020-05-23 21:46:54 +02:00
if n . User >= c . cpuStats [ i ] . User {
c . cpuStats [ i ] . User = n . User
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU User counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . User , "new_value" , n . User )
2020-05-23 21:46:54 +02:00
}
if n . Nice >= c . cpuStats [ i ] . Nice {
c . cpuStats [ i ] . Nice = n . Nice
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU Nice counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . Nice , "new_value" , n . Nice )
2020-05-23 21:46:54 +02:00
}
if n . System >= c . cpuStats [ i ] . System {
c . cpuStats [ i ] . System = n . System
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU System counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . System , "new_value" , n . System )
2020-05-23 21:46:54 +02:00
}
if n . Iowait >= c . cpuStats [ i ] . Iowait {
c . cpuStats [ i ] . Iowait = n . Iowait
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU Iowait counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . Iowait , "new_value" , n . Iowait )
2020-05-23 21:46:54 +02:00
}
if n . IRQ >= c . cpuStats [ i ] . IRQ {
c . cpuStats [ i ] . IRQ = n . IRQ
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU IRQ counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . IRQ , "new_value" , n . IRQ )
2020-05-23 21:46:54 +02:00
}
if n . SoftIRQ >= c . cpuStats [ i ] . SoftIRQ {
c . cpuStats [ i ] . SoftIRQ = n . SoftIRQ
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU SoftIRQ counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . SoftIRQ , "new_value" , n . SoftIRQ )
2020-05-23 21:46:54 +02:00
}
if n . Steal >= c . cpuStats [ i ] . Steal {
c . cpuStats [ i ] . Steal = n . Steal
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU Steal counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . Steal , "new_value" , n . Steal )
2020-05-23 21:46:54 +02:00
}
if n . Guest >= c . cpuStats [ i ] . Guest {
c . cpuStats [ i ] . Guest = n . Guest
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU Guest counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . Guest , "new_value" , n . Guest )
2020-05-23 21:46:54 +02:00
}
if n . GuestNice >= c . cpuStats [ i ] . GuestNice {
c . cpuStats [ i ] . GuestNice = n . GuestNice
} else {
2020-09-25 08:42:51 +02:00
level . Debug ( c . logger ) . Log ( "msg" , "CPU GuestNice counter jumped backwards" , "cpu" , i , "old_value" , c . cpuStats [ i ] . GuestNice , "new_value" , n . GuestNice )
2020-05-23 21:46:54 +02:00
}
}
}