vmalert: add vmalert_remotewrite_sent_duration_seconds_total metric (#4517)

add `vmalert_remotewrite_sent_duration_seconds_total` metric
2024-11-23 20:37:12 +01:00 · 2023-06-26 13:34:51 +08:00 · 2023-06-26 13:34:51 +08:00 · a97887a2d9
commit a97887a2d9
parent 3846417df7
2 changed files with 7 additions and 0 deletions
--- a/app/vmalert/remotewrite/remotewrite.go
+++ b/app/vmalert/remotewrite/remotewrite.go
@ -183,9 +183,14 @@ func (c *Client) run(ctx context.Context) {
 var (
 	sentRows            = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
 	sentBytes           = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
+	sendDuration        = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
 	droppedRows         = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
 	droppedBytes        = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
 	bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
+
+	_ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
+		return float64(*concurrency)
+	})
 )

 // flush is a blocking function that marshals WriteRequest and sends
@ -211,6 +216,7 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
 		retryInterval = maxRetryInterval
 	}
 	timeStart := time.Now()
+	defer sendDuration.Add(time.Since(timeStart).Seconds())
 L:
 	for attempts := 0; ; attempts++ {
 		err := c.send(ctx, b)
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -34,6 +34,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): add panel for tracking rate of syscalls while writing or reading from disk via `process_io_(read|write)_syscalls_total` metrics.
 * FEATURE: accept timestamps in milliseconds at `start`, `end` and `time` query args in [Prometheus querying API](https://docs.victoriametrics.com/#prometheus-querying-api-usage). See [these docs](https://docs.victoriametrics.com/#timestamp-formats) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4459).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): update retry policy for pushing data to `-remoteWrite.url`. By default, vmalert will make multiple retry attempts with exponential delay. The total time spent during retry attempts shouldn't exceed `-remoteWrite.retryMaxTime` (default is 30s). When retry time is exceeded vmalert drops the data dedicated for `-remoteWrite.url`. Before, vmalert dropped data after 5 retry attempts with 1s delay between attempts (not configurable). See `-remoteWrite.retryMinInterval` and `-remoteWrite.retryMaxTime` cmd-line flags.
+* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): expose `vmalert_remotewrite_send_duration_seconds_total` counter, which can be used for determining high saturation of every connection to remote storage with an alerting query `sum(rate(vmalert_remotewrite_send_duration_seconds_total[5m])) by(job, instance) > 0.9 * max(vmalert_remotewrite_concurrency) by(job, instance)`. This query triggers when a connection is saturated by more than 90%. This usually means that `-remoteWrite.concurrency` command-line flag must be increased in order to increase the number of concurrent writings into remote endpoint. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4516).

 * BUGFIX: add the following command-line flags, which can be used for limiting Graphite API calls: 
  `--search.maxGraphiteTagKeys` for limiting the number of tag keys returned from Graphite `/tags`, `/tags/autoComplete/*`, `/tags/findSeries` API.