Add gauges for allocated memory for queued UDP and TCP packages (#1503)

* Two new states will be added to the tcpstat collector called rx_queued_bytes and tx_queued_bytes.

For UDP datagrams an additional collector 'udp_queues' can be used to expose the total lengths of the tx_queue and rx_queue.
@SuperQ and @discordianfish this changes gives us the option to check for overloaded UDP + TCP processing.
The names of the new TCP states and the UDP metric can be discussed.
The current reasons are just:

I don't want to add another collector for the same exposed file, so I just added the new states to the tcpstat collector.
I chose the name 'udp_queue' instead of 'udpstat' as UDP has no state.


Signed-off-by: Peter Bueschel <peter.bueschel@logmein.com>
This commit is contained in:
Peter Bueschel 2020-03-31 10:46:32 +02:00 committed by GitHub
parent 4891b01b6c
commit da5972b539
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 189 additions and 4 deletions

View File

@ -41,6 +41,7 @@
* [FEATURE] Add Btrfs collector #1512 * [FEATURE] Add Btrfs collector #1512
* [FEATURE] Add RAPL collector #1523 * [FEATURE] Add RAPL collector #1523
* [FEATURE] Add new softnet collector #1576 * [FEATURE] Add new softnet collector #1576
* [FEATURE] Add new udp_queues collector #1503
* [ENHANCEMENT] Log pid when there is a problem reading the process stats #1341 * [ENHANCEMENT] Log pid when there is a problem reading the process stats #1341
* [ENHANCEMENT] Collect InfiniBand port state and physical state #1357 * [ENHANCEMENT] Collect InfiniBand port state and physical state #1357
* [ENHANCEMENT] Include additional XFS runtime statistics. #1423 * [ENHANCEMENT] Include additional XFS runtime statistics. #1423

View File

@ -60,6 +60,7 @@ textfile | Exposes statistics read from local disk. The `--collector.textfile.di
thermal\_zone | Exposes thermal zone & cooling device statistics from `/sys/class/thermal`. | Linux thermal\_zone | Exposes thermal zone & cooling device statistics from `/sys/class/thermal`. | Linux
time | Exposes the current system time. | _any_ time | Exposes the current system time. | _any_
timex | Exposes selected adjtimex(2) system call stats. | Linux timex | Exposes selected adjtimex(2) system call stats. | Linux
udp_queues | Exposes UDP total lengths of the rx_queue and tx_queue from `/proc/net/udp` and `/proc/net/udp6`. | Linux
uname | Exposes system information as provided by the uname system call. | Darwin, FreeBSD, Linux, OpenBSD uname | Exposes system information as provided by the uname system call. | Darwin, FreeBSD, Linux, OpenBSD
vmstat | Exposes statistics from `/proc/vmstat`. | Linux vmstat | Exposes statistics from `/proc/vmstat`. | Linux
xfs | Exposes XFS runtime statistics. | Linux (kernel 4.4+) xfs | Exposes XFS runtime statistics. | Linux (kernel 4.4+)

View File

@ -2644,6 +2644,7 @@ node_scrape_collector_success{collector="softnet"} 1
node_scrape_collector_success{collector="stat"} 1 node_scrape_collector_success{collector="stat"} 1
node_scrape_collector_success{collector="textfile"} 1 node_scrape_collector_success{collector="textfile"} 1
node_scrape_collector_success{collector="thermal_zone"} 1 node_scrape_collector_success{collector="thermal_zone"} 1
node_scrape_collector_success{collector="udp_queues"} 1
node_scrape_collector_success{collector="vmstat"} 1 node_scrape_collector_success{collector="vmstat"} 1
node_scrape_collector_success{collector="wifi"} 1 node_scrape_collector_success{collector="wifi"} 1
node_scrape_collector_success{collector="xfs"} 1 node_scrape_collector_success{collector="xfs"} 1
@ -2734,6 +2735,10 @@ node_textfile_scrape_error 0
# HELP node_thermal_zone_temp Zone temperature in Celsius # HELP node_thermal_zone_temp Zone temperature in Celsius
# TYPE node_thermal_zone_temp gauge # TYPE node_thermal_zone_temp gauge
node_thermal_zone_temp{type="cpu-thermal",zone="0"} 12.376 node_thermal_zone_temp{type="cpu-thermal",zone="0"} 12.376
# HELP node_udp_queues Number of allocated memory in the kernel for UDP datagrams in bytes.
# TYPE node_udp_queues gauge
node_udp_queues{ip="v4",queue="rx"} 0
node_udp_queues{ip="v4",queue="tx"} 21
# HELP node_vmstat_oom_kill /proc/vmstat information field oom_kill. # HELP node_vmstat_oom_kill /proc/vmstat information field oom_kill.
# TYPE node_vmstat_oom_kill untyped # TYPE node_vmstat_oom_kill untyped
node_vmstat_oom_kill 0 node_vmstat_oom_kill 0

View File

@ -1,3 +1,3 @@
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
0: 00000000:0016 00000000:0000 0A 00000000:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0 0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0
1: 0F02000A:0016 0202000A:8B6B 01 00000000:00000000 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46 1: 0F02000A:0016 0202000A:8B6B 01 00000015:00000001 02:000AC99B 00000000 0 0 3652 4 ffff88003d3ae040 21 4 31 47 46

View File

@ -0,0 +1,2 @@
sl local_address rem_address st tx_queue rx_queue tr tm->when retrnsmt uid timeout inode
0: 00000000:0016 00000000:0000 0A 00000015:00000000 00:00000000 00000000 0 0 2740 1 ffff88003d3af3c0 100 0 0 10 0

View File

@ -52,6 +52,10 @@ const (
tcpListen tcpListen
// TCP_CLOSING // TCP_CLOSING
tcpClosing tcpClosing
// TCP_RX_BUFFER
tcpRxQueuedBytes
// TCP_TX_BUFFER
tcpTxQueuedBytes
) )
type tcpStatCollector struct { type tcpStatCollector struct {
@ -122,16 +126,34 @@ func parseTCPStats(r io.Reader) (map[tcpConnectionState]float64, error) {
if len(parts) == 0 { if len(parts) == 0 {
continue continue
} }
if len(parts) < 4 { if len(parts) < 5 {
return nil, fmt.Errorf("invalid TCP stats line: %q", line) return nil, fmt.Errorf("invalid TCP stats line: %q", line)
} }
qu := strings.Split(parts[4], ":")
if len(qu) < 2 {
return nil, fmt.Errorf("cannot parse tx_queues and rx_queues: %q", line)
}
tx, err := strconv.ParseUint(qu[0], 16, 64)
if err != nil {
return nil, err
}
tcpStats[tcpConnectionState(tcpTxQueuedBytes)] += float64(tx)
rx, err := strconv.ParseUint(qu[1], 16, 64)
if err != nil {
return nil, err
}
tcpStats[tcpConnectionState(tcpRxQueuedBytes)] += float64(rx)
st, err := strconv.ParseInt(parts[3], 16, 8) st, err := strconv.ParseInt(parts[3], 16, 8)
if err != nil { if err != nil {
return nil, err return nil, err
} }
tcpStats[tcpConnectionState(st)]++ tcpStats[tcpConnectionState(st)]++
} }
return tcpStats, nil return tcpStats, nil
@ -161,6 +183,10 @@ func (st tcpConnectionState) String() string {
return "listen" return "listen"
case tcpClosing: case tcpClosing:
return "closing" return "closing"
case tcpRxQueuedBytes:
return "rx_queued_bytes"
case tcpTxQueuedBytes:
return "tx_queued_bytes"
default: default:
return "unknown" return "unknown"
} }

View File

@ -28,8 +28,27 @@ func Test_parseTCPStatsError(t *testing.T) {
name: "too few fields", name: "too few fields",
in: "sl local_address\n 0: 00000000:0016", in: "sl local_address\n 0: 00000000:0016",
}, },
{
name: "missing colon in tx-rx field",
in: "sl local_address rem_address st tx_queue rx_queue\n" +
" 1: 0F02000A:0016 0202000A:8B6B 01 0000000000000001",
},
{
name: "tx parsing issue",
in: "sl local_address rem_address st tx_queue rx_queue\n" +
" 1: 0F02000A:0016 0202000A:8B6B 01 0000000x:00000001",
},
{
name: "rx parsing issue",
in: "sl local_address rem_address st tx_queue rx_queue\n" +
" 1: 0F02000A:0016 0202000A:8B6B 01 00000000:0000000x",
},
{
name: "state parsing issue",
in: "sl local_address rem_address st tx_queue rx_queue\n" +
" 1: 0F02000A:0016 0202000A:8B6B 0H 00000000:00000001",
},
} }
for _, tt := range tests { for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) { t.Run(tt.name, func(t *testing.T) {
if _, err := parseTCPStats(strings.NewReader(tt.in)); err == nil { if _, err := parseTCPStats(strings.NewReader(tt.in)); err == nil {
@ -40,6 +59,14 @@ func Test_parseTCPStatsError(t *testing.T) {
} }
func TestTCPStat(t *testing.T) { func TestTCPStat(t *testing.T) {
noFile, _ := os.Open("follow the white rabbit")
defer noFile.Close()
if _, err := parseTCPStats(noFile); err == nil {
t.Fatal("expected an error, but none occurred")
}
file, err := os.Open("fixtures/proc/net/tcpstat") file, err := os.Open("fixtures/proc/net/tcpstat")
if err != nil { if err != nil {
t.Fatal(err) t.Fatal(err)
@ -58,4 +85,39 @@ func TestTCPStat(t *testing.T) {
if want, got := 1, int(tcpStats[tcpListen]); want != got { if want, got := 1, int(tcpStats[tcpListen]); want != got {
t.Errorf("want tcpstat number of listen state %d, got %d", want, got) t.Errorf("want tcpstat number of listen state %d, got %d", want, got)
} }
if want, got := 42, int(tcpStats[tcpTxQueuedBytes]); want != got {
t.Errorf("want tcpstat number of bytes in tx queue %d, got %d", want, got)
}
if want, got := 1, int(tcpStats[tcpRxQueuedBytes]); want != got {
t.Errorf("want tcpstat number of bytes in rx queue %d, got %d", want, got)
}
}
func Test_getTCPStats(t *testing.T) {
type args struct {
statsFile string
}
tests := []struct {
name string
args args
wantErr bool
}{
{
name: "file not found",
args: args{statsFile: "somewhere over the rainbow"},
wantErr: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
_, err := getTCPStats(tt.args.statsFile)
if (err != nil) != tt.wantErr {
t.Errorf("getTCPStats() error = %v, wantErr %v", err, tt.wantErr)
return
}
// other cases are covered by TestTCPStat()
})
}
} }

View File

@ -0,0 +1,87 @@
// Copyright 2015 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// +build !noudp_queues
package collector
import (
"fmt"
"os"
"github.com/go-kit/kit/log"
"github.com/go-kit/kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/procfs"
)
type (
udpQueuesCollector struct {
fs procfs.FS
desc *prometheus.Desc
logger log.Logger
}
)
func init() {
registerCollector("udp_queues", defaultEnabled, NewUDPqueuesCollector)
}
// NewUDPqueuesCollector returns a new Collector exposing network udp queued bytes.
func NewUDPqueuesCollector(logger log.Logger) (Collector, error) {
fs, err := procfs.NewFS(*procPath)
if err != nil {
return nil, fmt.Errorf("failed to open procfs: %v", err)
}
return &udpQueuesCollector{
fs: fs,
desc: prometheus.NewDesc(
prometheus.BuildFQName(namespace, "udp", "queues"),
"Number of allocated memory in the kernel for UDP datagrams in bytes.",
[]string{"queue", "ip"}, nil,
),
logger: logger,
}, nil
}
func (c *udpQueuesCollector) Update(ch chan<- prometheus.Metric) error {
s4, errIPv4 := c.fs.NetUDPSummary()
if errIPv4 == nil {
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s4.TxQueueLength), "tx", "v4")
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s4.RxQueueLength), "rx", "v4")
} else {
if os.IsNotExist(errIPv4) {
level.Debug(c.logger).Log("msg", "not collecting ipv4 based metrics")
} else {
return fmt.Errorf("couldn't get upd queued bytes: %s", errIPv4)
}
}
s6, errIPv6 := c.fs.NetUDP6Summary()
if errIPv6 == nil {
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s6.TxQueueLength), "tx", "v6")
ch <- prometheus.MustNewConstMetric(c.desc, prometheus.GaugeValue, float64(s6.RxQueueLength), "rx", "v6")
} else {
if os.IsNotExist(errIPv6) {
level.Debug(c.logger).Log("msg", "not collecting ipv6 based metrics")
} else {
return fmt.Errorf("couldn't get upd6 queued bytes: %s", errIPv6)
}
}
if os.IsNotExist(errIPv4) && os.IsNotExist(errIPv6) {
return ErrNoData
}
return nil
}

View File

@ -38,6 +38,7 @@ enabled_collectors=$(cat << COLLECTORS
thermal_zone thermal_zone
textfile textfile
bonding bonding
udp_queues
vmstat vmstat
wifi wifi
xfs xfs