From bffd30b57af09ba01a6ea40378fefa3b37468efd Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Wed, 8 Nov 2023 00:53:07 -0600 Subject: [PATCH] app/vmalert: update remote-write process (#5284) * app/vmalert: update remote-write process * automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-living connections or environments without support of keep-alive on network balancers. * increment `vmalert_remotewrite_errors_total` metric if all retries to send remote-write request failed. Before, this metric was incremented only if remote-write client's buffer is overloaded. * increment `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls. Signed-off-by: hagen1778 * Update docs/CHANGELOG.md --------- Signed-off-by: hagen1778 Co-authored-by: Hui Wang --- app/vmalert/remotewrite/client.go | 21 +++++++++++++++++++-- app/vmalert/rule/group.go | 5 ----- docs/CHANGELOG.md | 3 +++ 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/app/vmalert/remotewrite/client.go b/app/vmalert/remotewrite/client.go index 525f6b459..6c012a350 100644 --- a/app/vmalert/remotewrite/client.go +++ b/app/vmalert/remotewrite/client.go @@ -3,6 +3,7 @@ package remotewrite import ( "bytes" "context" + "errors" "flag" "fmt" "io" @@ -117,12 +118,19 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) { // Push adds timeseries into queue for writing into remote storage. // Push returns and error if client is stopped or if queue is full. 
func (c *Client) Push(s prompbmarshal.TimeSeries) error { + rwTotal.Inc() select { case <-c.doneCh: + rwErrors.Inc() + droppedRows.Add(len(s.Samples)) + droppedBytes.Add(s.Size()) return fmt.Errorf("client is closed") case c.input <- s: return nil default: + rwErrors.Inc() + droppedRows.Add(len(s.Samples)) + droppedBytes.Add(s.Size()) return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+ "Queue size is controlled by -remoteWrite.maxQueueSize flag", c.maxQueueSize) @@ -181,11 +189,14 @@ func (c *Client) run(ctx context.Context) { } var ( + rwErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`) + rwTotal = metrics.NewCounter(`vmalert_remotewrite_total`) + sentRows = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`) sentBytes = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`) - sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`) droppedRows = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`) droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`) + sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`) bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`) _ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 { @@ -222,6 +233,11 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) { L: for attempts := 0; ; attempts++ { err := c.send(ctx, b) + if errors.Is(err, io.EOF) { + // Something in the middle between client and destination might be closing + // the connection. So we do one more attempt in hope the request will succeed. 
+ err = c.send(ctx, b) + } if err == nil { sentRows.Add(len(wr.Timeseries)) sentBytes.Add(len(b)) @@ -259,6 +275,7 @@ L: } + rwErrors.Inc() droppedRows.Add(len(wr.Timeseries)) droppedBytes.Add(len(b)) logger.Errorf("attempts to send remote-write request failed - dropping %d time series", @@ -303,7 +320,7 @@ func (c *Client) send(ctx context.Context, data []byte) error { // Prometheus remote Write compatible receivers MUST switch resp.StatusCode / 100 { case 2: - // respond with a HTTP 2xx status code when the write is successful. + // respond with HTTP 2xx status code when write is successful. return nil case 4: if resp.StatusCode != http.StatusTooManyRequests { diff --git a/app/vmalert/rule/group.go b/app/vmalert/rule/group.go index 36fb046a4..b635c1fa3 100644 --- a/app/vmalert/rule/group.go +++ b/app/vmalert/rule/group.go @@ -663,9 +663,6 @@ var ( execTotal = metrics.NewCounter(`vmalert_execution_total`) execErrors = metrics.NewCounter(`vmalert_execution_errors_total`) - - remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`) - remoteWriteTotal = metrics.NewCounter(`vmalert_remotewrite_total`) ) func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error { @@ -686,9 +683,7 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDurati pushToRW := func(tss []prompbmarshal.TimeSeries) error { var lastErr error for _, ts := range tss { - remoteWriteTotal.Inc() if err := e.Rw.Push(ts); err != nil { - remoteWriteErrors.Inc() lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err) } } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 3ad31d19f..bb02b0968 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -56,6 +56,7 @@ The sandbox cluster installation is running under the constant load generated by * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow specifying full url in notifier static_configs target address, like 
`http://alertmanager:9093/test/api/v2/alerts`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5184). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): reduce the number of queries for restoring alerts state on start-up. The change should speed up the restore process and reduce pressure on `remoteRead.url`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5265). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add label `file` pointing to the group's filename to metrics `vmalert_recording_.*` and `vmalert_alerts_.*`. The filename should help identifying alerting rules belonging to specific groups with identical names but different filenames. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5267). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-living connections or environments without support of keep-alive on network balancers. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support data ingestion from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent), [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.shardByURL.labels` command-line flag, which can be used for specifying a list of labels for sharding outgoing samples among the configured `-remoteWrite.url` destinations if `-remoteWrite.shardByURL` command-line flag is set. 
See [these docs](https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942) for details. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not exit on startup when [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs) refer to non-existing or invalid files with auth configs, since these files may appear / updated later. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153). @@ -82,6 +83,8 @@ The sandbox cluster installation is running under the constant load generated by * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine. * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if alert's `for` param was bigger than replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details. +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_errors_total` metric if all retries to send remote-write request failed. 
Before, this metric was incremented only if remote-write client's buffer is overloaded. +* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls. * BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087). * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component. * BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.