From fe5a1178313b2f56e53d4194ae306373d92a208a Mon Sep 17 00:00:00 2001 From: Ben Kochie Date: Mon, 13 Aug 2018 17:27:23 +0200 Subject: [PATCH] Handle vanishing PIDs (#1043) PIDs can vanish (exit) from /proc/ between gathering the list of PIDs and getting all of their stats. * Ignore file not found errors. * Explicitly count the PIDs we find. * Clean up some error style issues. Signed-off-by: Ben Kochie --- CHANGELOG.md | 2 +- collector/fixtures/proc/11/.missing_stat | 0 collector/processes_linux.go | 18 ++++++++++++++---- 3 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 collector/fixtures/proc/11/.missing_stat diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a447d66..7758cc90 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,9 +14,9 @@ The wifi collector is disabled by default due to suspected caching issues and go * [FEATURE] Add socket unit stats to systemd collector #968 * [FEATURE] Collect start time for systemd units * [ENHANCEMENT] -* [BUGFIX] * [BUGFIX] Fix goroutine leak in supervisord collector +* [BUGFIX] Handle vanishing PIDs #1043 ## 0.16.0 / 2018-05-15 diff --git a/collector/fixtures/proc/11/.missing_stat b/collector/fixtures/proc/11/.missing_stat new file mode 100644 index 00000000..e69de29b diff --git a/collector/processes_linux.go b/collector/processes_linux.go index fd6ddb1f..344844b6 100644 --- a/collector/processes_linux.go +++ b/collector/processes_linux.go @@ -17,7 +17,10 @@ package collector import ( "fmt" + "os" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/common/log" "github.com/prometheus/procfs" ) @@ -62,13 +65,13 @@ func NewProcessStatCollector() (Collector, error) { func (t *processCollector) Update(ch chan<- prometheus.Metric) error { pids, states, threads, err := getAllocatedThreads() if err != nil { - return fmt.Errorf("Unable to retrieve number of allocated threads %v\n", err) + return fmt.Errorf("unable to retrieve number of allocated threads: %q", err) } ch <- 
prometheus.MustNewConstMetric(t.threadAlloc, prometheus.GaugeValue, float64(threads)) maxThreads, err := readUintFromFile(procFilePath("sys/kernel/threads-max")) if err != nil { - return fmt.Errorf("Unable to retrieve limit number of threads %v\n", err) + return fmt.Errorf("unable to retrieve limit number of threads: %q", err) } ch <- prometheus.MustNewConstMetric(t.threadLimit, prometheus.GaugeValue, float64(maxThreads)) @@ -78,7 +81,7 @@ func (t *processCollector) Update(ch chan<- prometheus.Metric) error { pidM, err := readUintFromFile(procFilePath("sys/kernel/pid_max")) if err != nil { - return fmt.Errorf("Unable to retrieve limit number of maximum pids alloved %v\n", err) + return fmt.Errorf("unable to retrieve limit number of maximum pids alloved: %q", err) } ch <- prometheus.MustNewConstMetric(t.pidUsed, prometheus.GaugeValue, float64(pids)) ch <- prometheus.MustNewConstMetric(t.pidMax, prometheus.GaugeValue, float64(pidM)) @@ -95,15 +98,22 @@ func getAllocatedThreads() (int, map[string]int32, int, error) { if err != nil { return 0, nil, 0, err } + pids := 0 thread := 0 procStates := make(map[string]int32) for _, pid := range p { stat, err := pid.NewStat() + // PIDs can vanish between getting the list and getting stats. + if os.IsNotExist(err) { + log.Debugf("file not found when retrieving stats: %q", err) + continue + } if err != nil { return 0, nil, 0, err } + pids += 1 procStates[stat.State] += 1 thread += stat.NumThreads } - return len(p), procStates, thread, nil + return pids, procStates, thread, nil }