From 5ea6d71cb38db86f03e32b06daf16b286e6af76c Mon Sep 17 00:00:00 2001
From: Zakhar Bessarab
Date: Fri, 24 Feb 2023 23:39:56 +0400
Subject: [PATCH] fix: do not use exponential backoff for first retry of scrape
 request (#3824)

* fix: do not use exponential backoff for first retry of scrape request (#3293)

* lib/promscrape: refactor `doRequestWithPossibleRetry` backoff to simplify logic

Signed-off-by: Zakhar Bessarab

* Update lib/promscrape/client.go

Co-authored-by: Roman Khavronenko

* lib/promscrape: refactor `doRequestWithPossibleRetry` to make it more straightforward

Signed-off-by: Zakhar Bessarab

---------

Signed-off-by: Zakhar Bessarab
Co-authored-by: Roman Khavronenko
---
 docs/CHANGELOG.md                       |  1 +
 lib/promscrape/client.go                | 35 ++++++++++++++------
 lib/promscrape/discoveryutils/client.go | 43 ++++++++++++++++++-------
 3 files changed, 57 insertions(+), 22 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index bc5a6ee641..177d743f1d 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -42,6 +42,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * BUGFIX: prevent from possible data ingestion slowdown and query performance slowdown during [background merges of big parts](https://docs.victoriametrics.com/#storage) on systems with small number of CPU cores (1 or 2 CPU cores). The issue has been introduced in [v1.85.0](https://docs.victoriametrics.com/CHANGELOG.html#v1850) when implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337). See also [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3790).
 * BUGFIX: properly parse timestamps in milliseconds when [ingesting data via OpenTSDB telnet put protocol](https://docs.victoriametrics.com/#sending-data-via-telnet-put-protocol). Previously timestamps in milliseconds were mistakenly multiplied by 1000. Thanks to @Droxenator for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3810).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): do not add extrapolated points outside the real points when using [interpolate()](https://docs.victoriametrics.com/MetricsQL.html#interpolate) function. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3816).
+* BUGFIX: do not use exponential backoff for the first retry of a scrape request. Previously the first retry was delayed by 2 seconds, which could lead to increased scrape delays when the connection is closed by a short keep-alive timeout. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3293).
 
 ## [v1.87.1](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.87.1)
diff --git a/lib/promscrape/client.go b/lib/promscrape/client.go
index c74fea462e..d270368abc 100644
--- a/lib/promscrape/client.go
+++ b/lib/promscrape/client.go
@@ -351,33 +351,48 @@ var (
 )
 
 func doRequestWithPossibleRetry(ctx context.Context, hc *fasthttp.HostClient, req *fasthttp.Request, resp *fasthttp.Response, deadline time.Time) error {
-	sleepTime := time.Second
 	scrapeRequests.Inc()
 	reqCtx, cancel := context.WithDeadline(ctx, deadline)
 	defer cancel()
-	for {
+
+	var reqErr error
+	// Return true if the request execution is completed and retry is not required
+	attempt := func() bool {
 		// Use DoCtx instead of Do in order to support context cancellation
-		err := hc.DoCtx(reqCtx, req, resp)
-		if err == nil {
+		reqErr = hc.DoCtx(reqCtx, req, resp)
+		if reqErr == nil {
 			statusCode := resp.StatusCode()
 			if statusCode != fasthttp.StatusTooManyRequests {
-				return nil
+				return true
 			}
-		} else if err != fasthttp.ErrConnectionClosed && !strings.Contains(err.Error(), "broken pipe") {
-			return err
+		} else if reqErr != fasthttp.ErrConnectionClosed && !strings.Contains(reqErr.Error(), "broken pipe") {
+			return true
+		}
+
+		return false
+	}
+
+	if attempt() {
+		return reqErr
+	}
+
+	sleepTime := time.Second
+	maxSleepTime := time.Until(deadline)
+	for {
+		scrapeRetries.Inc()
+		if attempt() {
+			return reqErr
 		}
 		// Retry request after exponentially increased sleep.
-		maxSleepTime := time.Until(deadline)
 		if sleepTime > maxSleepTime {
-			return fmt.Errorf("the server closes all the connection attempts: %w", err)
+			return fmt.Errorf("the server closes all the connection attempts: %w", reqErr)
 		}
 		sleepTime += sleepTime
 		if sleepTime > maxSleepTime {
 			sleepTime = maxSleepTime
 		}
 		time.Sleep(sleepTime)
-		scrapeRetries.Inc()
 	}
 }
diff --git a/lib/promscrape/discoveryutils/client.go b/lib/promscrape/discoveryutils/client.go
index 772339dc9c..d276bec3ad 100644
--- a/lib/promscrape/discoveryutils/client.go
+++ b/lib/promscrape/discoveryutils/client.go
@@ -16,6 +16,7 @@ import (
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
 	"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
+	"github.com/VictoriaMetrics/fasthttp"
 	"github.com/VictoriaMetrics/metrics"
 )
 
@@ -270,30 +271,48 @@ func (c *Client) Stop() {
 }
 
 func doRequestWithPossibleRetry(hc *HTTPClient, req *http.Request, deadline time.Time) (*http.Response, error) {
-	sleepTime := time.Second
 	discoveryRequests.Inc()
-	for {
-		resp, err := hc.client.Do(req)
-		if err == nil {
+	var (
+		reqErr error
+		resp   *http.Response
+	)
+	// Return true if the request execution is completed and retry is not required
+	attempt := func() bool {
+		// Use DoCtx instead of Do in order to support context cancellation
+		resp, reqErr = hc.client.Do(req)
+		if reqErr == nil {
 			statusCode := resp.StatusCode
-			if statusCode != http.StatusTooManyRequests {
-				return resp, nil
+			if statusCode != fasthttp.StatusTooManyRequests {
+				return true
 			}
-		} else if err != net.ErrClosed && !strings.Contains(err.Error(), "broken pipe") {
-			return nil, err
+		} else if reqErr != fasthttp.ErrConnectionClosed && !strings.Contains(reqErr.Error(), "broken pipe") {
+			return true
 		}
-		// Retry request after exponentially increased sleep.
-		maxSleepTime := time.Until(deadline)
+
+		return false
+	}
+
+	if attempt() {
+		return resp, reqErr
+	}
+
+	sleepTime := time.Second
+	maxSleepTime := time.Until(deadline)
+	for {
+		discoveryRetries.Inc()
+		if attempt() {
+			return resp, reqErr
+		}
+
 		if sleepTime > maxSleepTime {
-			return nil, fmt.Errorf("the server closes all the connection attempts: %w", err)
+			return nil, fmt.Errorf("the server closes all the connection attempts: %w", reqErr)
 		}
 		sleepTime += sleepTime
 		if sleepTime > maxSleepTime {
 			sleepTime = maxSleepTime
 		}
 		time.Sleep(sleepTime)
-		discoveryRetries.Inc()
 	}
 }
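
For illustration only, the retry shape shared by the two refactored doRequestWithPossibleRetry functions can be sketched as a standalone Go program: run the request once, retry once without the former 2-second initial backoff, and only back off exponentially (capped by the time left until the deadline) for subsequent retries. The retryWithBackoff helper and errRetriable sentinel below are assumed names invented for this sketch; they are not identifiers from the VictoriaMetrics codebase.

package main

import (
	"errors"
	"fmt"
	"time"
)

// errRetriable marks failures worth retrying (a stand-in for the
// "connection closed" / "broken pipe" / HTTP 429 cases handled in the patch).
var errRetriable = errors.New("retriable error")

// retryWithBackoff runs attempt once and, on a retriable failure, keeps retrying:
// the first retry happens without an exponential delay, and later retries sleep
// for a doubling duration that is capped by the time remaining until deadline.
func retryWithBackoff(attempt func() error, deadline time.Time) error {
	if err := attempt(); err == nil || !errors.Is(err, errRetriable) {
		return err
	}
	sleepTime := time.Second
	maxSleepTime := time.Until(deadline)
	for {
		err := attempt()
		if err == nil || !errors.Is(err, errRetriable) {
			return err
		}
		if sleepTime > maxSleepTime {
			return fmt.Errorf("deadline exceeded after retries: %w", err)
		}
		sleepTime += sleepTime
		if sleepTime > maxSleepTime {
			sleepTime = maxSleepTime
		}
		time.Sleep(sleepTime)
	}
}

func main() {
	calls := 0
	// The first call fails with a retriable error; the immediate retry succeeds,
	// so no backoff sleep happens at all.
	err := retryWithBackoff(func() error {
		calls++
		if calls == 1 {
			return errRetriable
		}
		return nil
	}, time.Now().Add(30*time.Second))
	fmt.Printf("calls=%d err=%v\n", calls, err)
}

Capping the doubling sleep at the remaining deadline means a final retry is still attempted shortly before the deadline instead of overshooting it, which mirrors the sleepTime/maxSleepTime handling in both patched functions.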