From da5972b5398cd67a8854e6b1ee6b861bfda5ef83 Mon Sep 17 00:00:00 2001 From: Peter Bueschel Date: Tue, 31 Mar 2020 10:46:32 +0200 Subject: [PATCH] Add gauges for allocated memory for queued UDP and TCP packages (#1503) * Two new states will be added to the tcpstat collector called rx_queued_bytes and tx_queued_bytes. For UDP datagrams an additional collector 'udp_queues' can be used to expose the total lengths of the tx_queue and rx_queue. @SuperQ and @discordianfish this changes gives us the option to check for overloaded UDP + TCP processing. The names of the new TCP states and the UDP metric can be discussed. The current reasons are just: I don't want to add another collector for the same exposed file, so I just added the new states to the tcpstat collector. I chose the name 'udp_queue' instead of 'udpstat' as UDP has no state. Signed-off-by: Peter Bueschel --- CHANGELOG.md | 1 + README.md | 1 + collector/fixtures/e2e-output.txt | 5 ++ collector/fixtures/proc/net/tcpstat | 4 +- collector/fixtures/proc/net/udp | 2 + collector/tcpstat_linux.go | 28 +++++++++- collector/tcpstat_linux_test.go | 64 ++++++++++++++++++++- collector/udp_queues_linux.go | 87 +++++++++++++++++++++++++++++ end-to-end-test.sh | 1 + 9 files changed, 189 insertions(+), 4 deletions(-) create mode 100644 collector/fixtures/proc/net/udp create mode 100644 collector/udp_queues_linux.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 8753f0df..d78f0056 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ * [FEATURE] Add Btrfs collector #1512 * [FEATURE] Add RAPL collector #1523 * [FEATURE] Add new softnet collector #1576 +* [FEATURE] Add new udp_queues collector #1503 * [ENHANCEMENT] Log pid when there is a problem reading the process stats #1341 * [ENHANCEMENT] Collect InfiniBand port state and physical state #1357 * [ENHANCEMENT] Include additional XFS runtime statistics. #1423 diff --git a/README.md b/README.md index 13323729..3a1546e6 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ textfile | Exposes statistics read from local disk. The `--collector.textfile.di thermal\_zone | Exposes thermal zone & cooling device statistics from `/sys/class/thermal`. | Linux time | Exposes the current system time. | _any_ timex | Exposes selected adjtimex(2) system call stats. | Linux +udp_queues | Exposes UDP total lengths of the rx_queue and tx_queue from `/proc/net/udp` and `/proc/net/udp6`. | Linux uname | Exposes system information as provided by the uname system call. | Darwin, FreeBSD, Linux, OpenBSD vmstat | Exposes statistics from `/proc/vmstat`. | Linux xfs | Exposes XFS runtime statistics. | Linux (kernel 4.4+) diff --git a/collector/fixtures/e2e-output.txt b/collector/fixtures/e2e-output.txt index 907b4201..5a2ba253 100644 --- a/collector/fixtures/e2e-output.txt +++ b/collector/fixtures/e2e-output.txt @@ -2644,6 +2644,7 @@ node_scrape_collector_success{collector="softnet"} 1 node_scrape_collector_success{collector="stat"} 1 node_scrape_collector_success{collector="textfile"} 1 node_scrape_collector_success{collector="thermal_zone"} 1 +node_scrape_collector_success{collector="udp_queues"} 1 node_scrape_collector_success{collector="vmstat"} 1 node_scrape_collector_success{collector="wifi"} 1 node_scrape_collector_success{collector="xfs"} 1 @@ -2734,6 +2735,10 @@ node_textfile_scrape_error 0 # HELP node_thermal_zone_temp Zone temperature in Celsius # TYPE node_thermal_zone_temp gauge node_thermal_zone_temp{type="cpu-thermal",zone="0"} 12.376 +# HELP node_udp_queues Number of allocated memory in the kernel for UDP datagrams in bytes. +# TYPE node_udp_queues gauge +node_udp_queues{ip="v4",queue="rx"} 0 +node_udp_queues{ip="v4",queue="tx"} 21 # HELP node_vmstat_oom_kill /proc/vmstat information field oom_kill. # TYPE node_vmstat_oom_kill untyped node_vmstat_oom_kill 0 diff --git a/collector/fixtures/proc/net/tcpstat b/collector/fixtures/proc/net/tcpstat index 8b3777a9..352c00bb 100644 --- a/collector/fixtures/proc/net/tcpstat +++ b/collector/fixtures/proc/net/tcpstat @@ -1,3 +1,3 @@ sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode - 0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 - 1: 0F02000A:0016 0202000A:8B6B 01 00000000:00000000 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46 + 0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 + 1: 0F02000A:0016 0202000A:8B6B 01 00000015:00000001 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46 diff --git a/collector/fixtures/proc/net/udp b/collector/fixtures/proc/net/udp new file mode 100644 index 00000000..3c505240 --- /dev/null +++ b/collector/fixtures/proc/net/udp @@ -0,0 +1,2 @@ + sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode + 0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 diff --git a/collector/tcpstat_linux.go b/collector/tcpstat_linux.go index 9b09e9ab..db9c655f 100644 --- a/collector/tcpstat_linux.go +++ b/collector/tcpstat_linux.go @@ -52,6 +52,10 @@ const ( tcpListen // TCP_CLOSING tcpClosing + // TCP_RX_BUFFER + tcpRxQueuedBytes + // TCP_TX_BUFFER + tcpTxQueuedBytes ) type tcpStatCollector struct { @@ -122,16 +126,34 @@ func parseTCPStats(r io.Reader) (map[tcpConnectionState]float64, error) { if len(parts) == 0 { continue } - if len(parts) < 4 { + if len(parts) < 5 { return nil, fmt.Errorf("invalid TCP stats line: %q", line) } + qu := strings.Split(parts[4], ":") + if len(qu) < 2 { + return nil, fmt.Errorf("cannot parse tx_queues and rx_queues: %q", line) + } + + tx, err := strconv.ParseUint(qu[0], 16, 64) + if err != nil { + return nil, err + } + tcpStats[tcpConnectionState(tcpTxQueuedBytes)] += float64(tx) + + rx, err := strconv.ParseUint(qu[1], 16, 64) + if err != nil { + return nil, err + } + tcpStats[tcpConnectionState(tcpRxQueuedBytes)] += float64(rx) + st, err := strconv.ParseInt(parts[3], 16, 8) if err != nil { return nil, err } tcpStats[tcpConnectionState(st)]++ + } return tcpStats, nil @@ -161,6 +183,10 @@ func (st tcpConnectionState) String() string { return "listen" case tcpClosing: return "closing" + case tcpRxQueuedBytes: + return "rx_queued_bytes" + case tcpTxQueuedBytes: + return "tx_queued_bytes" default: return "unknown" } diff --git a/collector/tcpstat_linux_test.go b/collector/tcpstat_linux_test.go index f4c3b36c..b609b846 100644 --- a/collector/tcpstat_linux_test.go +++ b/collector/tcpstat_linux_test.go @@ -28,8 +28,27 @@ func Test_parseTCPStatsError(t *testing.T) { name: "too few fields", in: "sl local_address\n 0: 00000000:0016", }, + { + name: "missing colon in tx-rx field", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 01 0000000000000001", + }, + { + name: "tx parsing issue", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 01 0000000x:00000001", + }, + { + name: "rx parsing issue", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 01 00000000:0000000x", + }, + { + name: "state parsing issue", + in: "sl local_address rem_address st tx_queue rx_queue\n" + + " 1: 0F02000A:0016 0202000A:8B6B 0H 00000000:00000001", + }, } - for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { if _, err := parseTCPStats(strings.NewReader(tt.in)); err == nil { @@ -40,6 +59,14 @@ func Test_parseTCPStatsError(t *testing.T) { } func TestTCPStat(t *testing.T) { + + noFile, _ := os.Open("follow the white rabbit") + defer noFile.Close() + + if _, err := parseTCPStats(noFile); err == nil { + t.Fatal("expected an error, but none occurred") + } + file, err := os.Open("fixtures/proc/net/tcpstat") if err != nil { t.Fatal(err) @@ -58,4 +85,39 @@ func TestTCPStat(t *testing.T) { if want, got := 1, int(tcpStats[tcpListen]); want != got { t.Errorf("want tcpstat number of listen state %d, got %d", want, got) } + + if want, got := 42, int(tcpStats[tcpTxQueuedBytes]); want != got { + t.Errorf("want tcpstat number of bytes in tx queue %d, got %d", want, got) + } + if want, got := 1, int(tcpStats[tcpRxQueuedBytes]); want != got { + t.Errorf("want tcpstat number of bytes in rx queue %d, got %d", want, got) + } + +} + +func Test_getTCPStats(t *testing.T) { + type args struct { + statsFile string + } + tests := []struct { + name string + args args + wantErr bool + }{ + { + name: "file not found", + args: args{statsFile: "somewhere over the rainbow"}, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + _, err := getTCPStats(tt.args.statsFile) + if (err != nil) != tt.wantErr { + t.Errorf("getTCPStats() error = %v, wantErr %v", err, tt.wantErr) + return + } + // other cases are covered by TestTCPStat() + }) + } } diff --git a/collector/udp_queues_linux.go b/collector/udp_queues_linux.go new file mode 100644 index 00000000..512c0100 --- /dev/null +++ b/collector/udp_queues_linux.go @@ -0,0 +1,87 @@ +// Copyright 2015 The Prometheus Authors +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// +build !noudp_queues + +package collector + +import ( + "fmt" + "os" + + "github.com/go-kit/kit/log" + "github.com/go-kit/kit/log/level" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/procfs" +) + +type ( + udpQueuesCollector struct { + fs procfs.FS + desc *prometheus.Desc + logger log.Logger + } +) + +func init() { + registerCollector("udp_queues", defaultEnabled, NewUDPqueuesCollector) +} + +// NewUDPqueuesCollector returns a new Collector exposing network udp queued bytes. +func NewUDPqueuesCollector(logger log.Logger) (Collector, error) { + fs, err := procfs.NewFS(*procPath) + if err != nil { + return nil, fmt.Errorf("failed to open procfs: %v", err) + } + return &udpQueuesCollector{ + fs: fs, + desc: prometheus.NewDesc( + prometheus.BuildFQName(namespace, "udp", "queues"), + "Number of allocated memory in the kernel for UDP datagrams in bytes.", + []string{"queue", "ip"}, nil, + ), + logger: logger, + }, nil +} + +func (c *udpQueuesCollector) Update(ch chan<- prometheus.Metric) error { + + s4, errIPv4 := c.fs.NetUDPSummary() + if errIPv4 == nil { + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s4.TxQueueLength), "tx", "v4") + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s4.RxQueueLength), "rx", "v4") + } else { + if os.IsNotExist(errIPv4) { + level.Debug(c.logger).Log("msg", "not collecting ipv4 based metrics") + } else { + return fmt.Errorf("couldn't get upd queued bytes: %s", errIPv4) + } + } + + s6, errIPv6 := c.fs.NetUDP6Summary() + if errIPv6 == nil { + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s6.TxQueueLength), "tx", "v6") + ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s6.RxQueueLength), "rx", "v6") + } else { + if os.IsNotExist(errIPv6) { + level.Debug(c.logger).Log("msg", "not collecting ipv6 based metrics") + } else { + return fmt.Errorf("couldn't get upd6 queued bytes: %s", errIPv6) + } + } + + if os.IsNotExist(errIPv4) && os.IsNotExist(errIPv6) { + return ErrNoData + } + return nil +} diff --git a/end-to-end-test.sh b/end-to-end-test.sh index fb1520d4..961dd27e 100755 --- a/end-to-end-test.sh +++ b/end-to-end-test.sh @@ -38,6 +38,7 @@ enabled_collectors=$(cat << COLLECTORS thermal_zone textfile bonding + udp_queues vmstat wifi xfs