Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git, synced 2024-12-03 16:21:14 +01:00
app/vmalert: update remote-write process (#5284)
* app/vmalert: update remote-write process

  * automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-lived connections or environments without keep-alive support on network balancers.
  * increment the `vmalert_remotewrite_errors_total` metric if all retries to send a remote-write request failed. Before, this metric was incremented only if the remote-write client's buffer was overloaded.
  * increment the `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if the remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls.

  Signed-off-by: hagen1778 <roman@victoriametrics.com>

* Update docs/CHANGELOG.md

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Hui Wang <haley@victoriametrics.com>
Commit: bffd30b57a
Parent: 5afc6a5765
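To make the retry behavior described in the commit message concrete, here is a minimal, self-contained Go sketch of the retry-on-closed-connection idea. The `sendFn` type and `sendWithRetryOnEOF` helper are hypothetical illustrations, not part of vmalert's API.

```go
package main

import (
	"errors"
	"fmt"
	"io"
)

// sendFn stands in for an HTTP remote-write call; it is a hypothetical
// placeholder, not the actual vmalert client method.
type sendFn func(data []byte) error

// sendWithRetryOnEOF performs one extra attempt when the first call fails
// with io.EOF, which is what Go's HTTP client typically returns when a
// keep-alive connection was closed by the other side between requests.
func sendWithRetryOnEOF(send sendFn, data []byte) error {
	err := send(data)
	if errors.Is(err, io.EOF) {
		// The connection was likely closed by a load balancer or peer;
		// retry once instead of immediately logging an error.
		err = send(data)
	}
	return err
}

func main() {
	calls := 0
	flaky := func(data []byte) error {
		calls++
		if calls == 1 {
			return io.EOF // simulate a closed keep-alive connection
		}
		return nil
	}
	fmt.Println(sendWithRetryOnEOF(flaky, []byte("payload"))) // <nil>
}
```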
@@ -3,6 +3,7 @@ package remotewrite
 import (
     "bytes"
     "context"
+    "errors"
     "flag"
     "fmt"
     "io"
@@ -117,12 +118,19 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
 // Push adds timeseries into queue for writing into remote storage.
 // Push returns and error if client is stopped or if queue is full.
 func (c *Client) Push(s prompbmarshal.TimeSeries) error {
+    rwTotal.Inc()
     select {
     case <-c.doneCh:
+        rwErrors.Inc()
+        droppedRows.Add(len(s.Samples))
+        droppedBytes.Add(s.Size())
         return fmt.Errorf("client is closed")
     case c.input <- s:
         return nil
     default:
+        rwErrors.Inc()
+        droppedRows.Add(len(s.Samples))
+        droppedBytes.Add(s.Size())
         return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
             "Queue size is controlled by -remoteWrite.maxQueueSize flag",
             c.maxQueueSize)
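For readers less familiar with Go channels, the following standalone sketch shows the same non-blocking `select` pattern the Push hunk relies on: the call never blocks, and a full buffer or a closed client is reported as an error. All names below are illustrative, not vmalert's.

```go
package main

import "fmt"

// push never blocks: it reports an error when the client is closed or
// the bounded queue (the buffered channel) is already full.
func push(input chan string, done chan struct{}, v string) error {
	select {
	case <-done:
		return fmt.Errorf("client is closed")
	case input <- v:
		return nil
	default:
		return fmt.Errorf("queue is full (%d entries)", cap(input))
	}
}

func main() {
	done := make(chan struct{})
	input := make(chan string, 1)

	fmt.Println(push(input, done, "a")) // <nil> - the buffered slot is free
	fmt.Println(push(input, done, "b")) // queue is full (1 entries)

	close(done)
	fmt.Println(push(input, done, "c")) // client is closed
}
```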
@@ -181,11 +189,14 @@ func (c *Client) run(ctx context.Context) {
 }

 var (
+    rwErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
+    rwTotal  = metrics.NewCounter(`vmalert_remotewrite_total`)
+
     sentRows     = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
     sentBytes    = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
-    sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
     droppedRows  = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
     droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
+    sendDuration        = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
     bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)

     _ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
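The counters above come from the `github.com/VictoriaMetrics/metrics` package, which keeps registered metrics in a default set and renders them in Prometheus text format. A rough, hypothetical example of registering and exposing such counters (the metric names and HTTP handler below are made up for illustration and are not part of vmalert):

```go
package main

import (
	"net/http"

	"github.com/VictoriaMetrics/metrics"
)

// Counters created with metrics.NewCounter are registered in the package's
// default set and show up automatically in the /metrics output below.
var (
	demoTotal  = metrics.NewCounter(`demo_requests_total`)
	demoErrors = metrics.NewCounter(`demo_request_errors_total`)
)

func main() {
	http.HandleFunc("/work", func(w http.ResponseWriter, r *http.Request) {
		demoTotal.Inc()
		if r.URL.Query().Get("fail") != "" {
			demoErrors.Inc()
			http.Error(w, "failed", http.StatusInternalServerError)
			return
		}
		w.Write([]byte("ok"))
	})
	// Expose all registered metrics in Prometheus text exposition format.
	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, true)
	})
	http.ListenAndServe(":8080", nil)
}
```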
@@ -222,6 +233,11 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
 L:
     for attempts := 0; ; attempts++ {
         err := c.send(ctx, b)
+        if errors.Is(err, io.EOF) {
+            // Something in the middle between client and destination might be closing
+            // the connection. So we do a one more attempt in hope request will succeed.
+            err = c.send(ctx, b)
+        }
         if err == nil {
             sentRows.Add(len(wr.Timeseries))
             sentBytes.Add(len(b))
@@ -259,6 +275,7 @@ L:

     }

+    rwErrors.Inc()
     droppedRows.Add(len(wr.Timeseries))
     droppedBytes.Add(len(b))
     logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
@@ -303,7 +320,7 @@ func (c *Client) send(ctx context.Context, data []byte) error {
     // Prometheus remote Write compatible receivers MUST
     switch resp.StatusCode / 100 {
     case 2:
-        // respond with a HTTP 2xx status code when the write is successful.
+        // respond with HTTP 2xx status code when write is successful.
         return nil
     case 4:
         if resp.StatusCode != http.StatusTooManyRequests {
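The status-code handling shown above follows the Prometheus remote-write recommendation: 2xx means success, 4xx responses other than 429 should not be retried, while 429 and 5xx may be retried. A small illustrative sketch of that classification (not vmalert's exact code):

```go
package main

import (
	"fmt"
	"net/http"
)

// classify mirrors the status-code-class idea from the hunk above:
// 2xx is success, 4xx (except 429 Too Many Requests) is not worth
// retrying, and everything else may be retried.
func classify(statusCode int) string {
	switch statusCode / 100 {
	case 2:
		return "success"
	case 4:
		if statusCode != http.StatusTooManyRequests {
			return "non-retriable"
		}
		return "retriable"
	default:
		return "retriable"
	}
}

func main() {
	for _, code := range []int{200, 204, 400, 404, 429, 500, 502} {
		fmt.Printf("%d => %s\n", code, classify(code))
	}
}
```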
@@ -663,9 +663,6 @@ var (

     execTotal  = metrics.NewCounter(`vmalert_execution_total`)
     execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
-
-    remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
-    remoteWriteTotal  = metrics.NewCounter(`vmalert_remotewrite_total`)
 )

 func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
@@ -686,9 +683,7 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
     pushToRW := func(tss []prompbmarshal.TimeSeries) error {
         var lastErr error
         for _, ts := range tss {
-            remoteWriteTotal.Inc()
             if err := e.Rw.Push(ts); err != nil {
-                remoteWriteErrors.Inc()
                 lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
             }
         }
@@ -56,6 +56,7 @@ The sandbox cluster installation is running under the constant load generated by
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow specifying full url in notifier static_configs target address, like `http://alertmanager:9093/test/api/v2/alerts`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5184).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): reduce the number of queries for restoring alerts state on start-up. The change should speed up the restore process and reduce pressure on `remoteRead.url`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5265).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add label `file` pointing to the group's filename to metrics `vmalert_recording_.*` and `vmalert_alerts_.*`. The filename should help identifying alerting rules belonging to specific groups with identical names but different filenames. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5267).
+* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-living connections or environments without support of keep-alive on network balancers.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support data ingestion from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent), [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.shardByURL.labels` command-line flag, which can be used for specifying a list of labels for sharding outgoing samples among the configured `-remoteWrite.url` destinations if `-remoteWrite.shardByURL` command-line flag is set. See [these docs](https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942) for details.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not exit on startup when [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs) refer to non-existing or invalid files with auth configs, since these files may appear / updated later. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
@@ -82,6 +83,8 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if alert's `for` param was bigger than replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_errors_total` metric if all retries to send remote-write request failed. Before, this metric was incremented only if remote-write client's buffer is overloaded.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls.
 * BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087).
 * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
 * BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.