From 5ea6d71cb38db86f03e32b06daf16b286e6af76c Mon Sep 17 00:00:00 2001
From: Zakhar Bessarab
Date: Fri, 24 Feb 2023 23:39:56 +0400
Subject: [PATCH] fix: do not use exponential backoff for first retry of scrape
 request (#3824)

* fix: do not use exponential backoff for first retry of scrape request (#3293)

* lib/promscrape: refactor `doRequestWithPossibleRetry` backoff to simplify logic

Signed-off-by: Zakhar Bessarab

* Update lib/promscrape/client.go

Co-authored-by: Roman Khavronenko

* lib/promscrape: refactor `doRequestWithPossibleRetry` to make it more straightforward

Signed-off-by: Zakhar Bessarab

---------

Signed-off-by: Zakhar Bessarab
Co-authored-by: Roman Khavronenko
---
 docs/CHANGELOG.md                       |  1 +
 lib/promscrape/client.go                | 35 ++++++++++++++------
 lib/promscrape/discoveryutils/client.go | 43 ++++++++++++++++++-------
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index bc5a6ee641..177d743f1d 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -42,6 +42,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * BUGFIX: prevent from possible data ingestion slowdown and query performance slowdown during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with small number of CPU cores (1 or 2 CPU cores). The issue has been introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790).
 * BUGFIX: properly parse timestamps in milliseconds when [ingesting data via OpenTSDB telnet put protocol](https://docs.victoriametrics.com/#sending-data-via-telnet-put-protocol). Previously timestamps in milliseconds were mistakenly multiplied by 1000. Thanks to @Droxenator for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3810).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): do not add extrapolated points outside the real points when using [interpolate()](https://docs.victoriametrics.com/MetricsQL.html#interpolate) function. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3816).
+* BUGFIX: do not use exponential backoff for the first retry of a scrape request. Previously the first retry was delayed by 2 seconds, which could lead to increased scrape delays when the connection is closed by a short keep-alive timeout. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3293).
 
 ## [v1.87.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.1)
diff --git a/lib/promscrape/client.go b/lib/promscrape/client.go
index c74fea462e..d270368abc 100644
--- a/lib/promscrape/client.go
+++ b/lib/promscrape/client.go
@@ -351,33 +351,48 @@ var (
 )
 
 func doRequestWithPossibleRetry(ctx context.Context, hc *fasthttp.HostClient, req *fasthttp.Request, resp *fasthttp.Response, deadline time.Time) error {
-	sleepTime := time.Second
 	scrapeRequests.Inc()
 	reqCtx, cancel := context.WithDeadline(ctx, deadline)
 	defer cancel()
-	for {
+
+	var reqErr error
+	// Return true if the request execution is completed and retry is not required
+	attempt := func() bool {
 		// Use DoCtx instead of Do in order to support context cancellation
-		err := hc.DoCtx(reqCtx, req, resp)
-		if err == nil {
+		reqErr = hc.DoCtx(reqCtx, req, resp)
+		if reqErr == nil {
 			statusCode := resp.StatusCode()
 			if statusCode != fasthttp.StatusTooManyRequests {
-				return nil
+				return true
 			}
-		} else if err != fasthttp.ErrConnectionClosed && !strings.Contains(err.Error(), "broken pipe") {
-			return err
+		} else if reqErr != fasthttp.ErrConnectionClosed && !strings.Contains(reqErr.Error(), "broken pipe") {
+			return true
+		}
+
+		return false
+	}
+
+	if attempt() {
+		return reqErr
+	}
+
+	sleepTime := time.Second
+	maxSleepTime := time.Until(deadline)
+	for {
+		scrapeRetries.Inc()
+		if attempt() {
+			return reqErr
 		}
 		// Retry request after exponentially increased sleep.
-		maxSleepTime := time.Until(deadline)
 		if sleepTime > maxSleepTime {
-			return fmt.Errorf("the server closes all the connection attempts: %w", err)
+			return fmt.Errorf("the server closes all the connection attempts: %w", reqErr)
 		}
 		sleepTime += sleepTime
 		if sleepTime > maxSleepTime {
 			sleepTime = maxSleepTime
 		}
 		time.Sleep(sleepTime)
-		scrapeRetries.Inc()
 	}
 }
diff --git a/lib/promscrape/discoveryutils/client.go b/lib/promscrape/discoveryutils/client.go
index 772339dc9c..d276bec3ad 100644
--- a/lib/promscrape/discoveryutils/client.go
+++ b/lib/promscrape/discoveryutils/client.go
@@ -16,6 +16,7 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
+	"github.com/VictoriaMetrics/fasthttp"
 	"github.com/VictoriaMetrics/metrics"
 )
 
@@ -270,30 +271,48 @@ func (c *Client) Stop() {
 }
 
 func doRequestWithPossibleRetry(hc *HTTPClient, req *http.Request, deadline time.Time) (*http.Response, error) {
-	sleepTime := time.Second
 	discoveryRequests.Inc()
-	for {
-		resp, err := hc.client.Do(req)
-		if err == nil {
+	var (
+		reqErr error
+		resp   *http.Response
+	)
+	// Return true if the request execution is completed and retry is not required
+	attempt := func() bool {
+		// Use DoCtx instead of Do in order to support context cancellation
+		resp, reqErr = hc.client.Do(req)
+		if reqErr == nil {
 			statusCode := resp.StatusCode
-			if statusCode != http.StatusTooManyRequests {
-				return resp, nil
+			if statusCode != fasthttp.StatusTooManyRequests {
+				return true
 			}
-		} else if err != net.ErrClosed && !strings.Contains(err.Error(), "broken pipe") {
-			return nil, err
+		} else if reqErr != fasthttp.ErrConnectionClosed && !strings.Contains(reqErr.Error(), "broken pipe") {
+			return true
 		}
-		// Retry request after exponentially increased sleep.
-		maxSleepTime := time.Until(deadline)
+
+		return false
+	}
+
+	if attempt() {
+		return resp, reqErr
+	}
+
+	sleepTime := time.Second
+	maxSleepTime := time.Until(deadline)
+	for {
+		discoveryRetries.Inc()
+		if attempt() {
+			return resp, reqErr
+		}
+
 		if sleepTime > maxSleepTime {
-			return nil, fmt.Errorf("the server closes all the connection attempts: %w", err)
+			return nil, fmt.Errorf("the server closes all the connection attempts: %w", reqErr)
 		}
 		sleepTime += sleepTime
 		if sleepTime > maxSleepTime {
 			sleepTime = maxSleepTime
 		}
 		time.Sleep(sleepTime)
-		discoveryRetries.Inc()
 	}
 }
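
For illustration only, the retry shape shared by the two refactored doRequestWithPossibleRetry functions can be sketched as a standalone Go program: run the request once, retry once without the former 2-second initial backoff, and only back off exponentially (capped by the time left until the deadline) for subsequent retries. The retryWithBackoff helper and errRetriable sentinel below are assumed names invented for this sketch; they are not identifiers from the VictoriaMetrics codebase.

package main

import (
	"errors"
	"fmt"
	"time"
)

// errRetriable marks failures worth retrying (a stand-in for the
// "connection closed" / "broken pipe" / HTTP 429 cases handled in the patch).
var errRetriable = errors.New("retriable error")

// retryWithBackoff runs attempt once and, on a retriable failure, keeps retrying:
// the first retry happens without an exponential delay, and later retries sleep
// for a doubling duration that is capped by the time remaining until deadline.
func retryWithBackoff(attempt func() error, deadline time.Time) error {
	if err := attempt(); err == nil || !errors.Is(err, errRetriable) {
		return err
	}
	sleepTime := time.Second
	maxSleepTime := time.Until(deadline)
	for {
		err := attempt()
		if err == nil || !errors.Is(err, errRetriable) {
			return err
		}
		if sleepTime > maxSleepTime {
			return fmt.Errorf("deadline exceeded after retries: %w", err)
		}
		sleepTime += sleepTime
		if sleepTime > maxSleepTime {
			sleepTime = maxSleepTime
		}
		time.Sleep(sleepTime)
	}
}

func main() {
	calls := 0
	// The first call fails with a retriable error; the immediate retry succeeds,
	// so no backoff sleep happens at all.
	err := retryWithBackoff(func() error {
		calls++
		if calls == 1 {
			return errRetriable
		}
		return nil
	}, time.Now().Add(30*time.Second))
	fmt.Printf("calls=%d err=%v\n", calls, err)
}

Capping the doubling sleep at the remaining deadline means a final retry is still attempted shortly before the deadline instead of overshooting it, which mirrors the sleepTime/maxSleepTime handling in both patched functions.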