vmalert: update retry policy for pushing data to -remoteWrite.url (#4504)

By default, vmalert will make multiple retry attempts with exponential delay.
The total time spent during retry attempts shouldn't exceed `-remoteWrite.retryMaxTime` (default is 30s).
When retry time is exceeded vmalert drops the data dedicated for `-remoteWrite.url`.
Before, vmalert dropped data after 5 retry attempts with 1s delay between attempts (not configurable).

See `-remoteWrite.retryMinInterval` and `-remoteWrite.retryMaxTime` cmd-line flags.

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Nikolay <nik@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2023-06-22 15:14:23 +02:00 committed by GitHub
parent 4aad7a43df
commit 5f9ad22884
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 35 additions and 12 deletions

View File

@ -1180,6 +1180,10 @@ The shortlist of configuration flags is the following:
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.
-remoteWrite.oauth2.tokenUrl string
Optional OAuth2 tokenURL to use for -notifier.url.
-remoteWrite.retryMaxTime duration
The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval (default 30s)
-remoteWrite.retryMinInterval duration
The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval (default 1s)
-remoteWrite.sendTimeout duration
Timeout for sending data to the configured -remoteWrite.url. (default 30s)
-remoteWrite.showURL

View File

@ -23,6 +23,8 @@ import (
var (
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
)
// Client is an asynchronous HTTP client for writing
@ -186,11 +188,6 @@ var (
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
)
var (
retryCount = 5
retryBackoff = time.Second
)
// flush is a blocking function that marshals WriteRequest and sends
// it to remote-write endpoint. Flush performs limited amount of retries
// if request fails.
@ -208,8 +205,14 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
}
b := snappy.Encode(nil, data)
L:
for attempts := 0; attempts < retryCount; attempts++ {
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
if retryInterval > maxRetryInterval {
retryInterval = maxRetryInterval
}
timeStart := time.Now()
L:
for attempts := 0; ; attempts++ {
err := c.send(ctx, b)
if err == nil {
sentRows.Add(len(wr.Timeseries))
@ -233,8 +236,19 @@ L:
default:
}
// sleeping to avoid remote db hammering
time.Sleep(retryBackoff)
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
if timeLeftForRetries < 0 {
// the max retry time has passed, so we give up
break
}
if retryInterval > timeLeftForRetries {
retryInterval = timeLeftForRetries
}
// sleeping to prevent remote db hammering
time.Sleep(retryInterval)
retryInterval *= 2
}
droppedRows.Add(len(wr.Timeseries))

View File

@ -19,10 +19,10 @@ import (
)
func TestClient_Push(t *testing.T) {
oldRetryBackoff := retryBackoff
retryBackoff = time.Millisecond * 10
oldMinInterval := *retryMinInterval
*retryMinInterval = time.Millisecond * 10
defer func() {
retryBackoff = oldRetryBackoff
*retryMinInterval = oldMinInterval
}()
testSrv := newRWServer()

View File

@ -33,6 +33,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: vmstorage: suppress "broken pipe" and "connection reset by peer" errors for search queries on vmstorage side. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4418/commits/a6a7795b9e1f210d614a2c5f9a3016b97ded4792) and [this](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4498/commits/830dac177f0f09032165c248943a5da0e10dfe90) commits.
* FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): add panel for tracking rate of syscalls while writing or reading from disk via `process_io_(read|write)_syscalls_total` metrics.
* FEATURE: accept timestamps in milliseconds at `start`, `end` and `time` query args in [Prometheus querying API](https://docs.victoriametrics.com/#prometheus-querying-api-usage). See [these docs](https://docs.victoriametrics.com/#timestamp-formats) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4459).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): update retry policy for pushing data to `-remoteWrite.url`. By default, vmalert will make multiple retry attempts with exponential delay. The total time spent during retry attempts shouldn't exceed `-remoteWrite.retryMaxTime` (default is 30s). When retry time is exceeded vmalert drops the data dedicated for `-remoteWrite.url`. Before, vmalert dropped data after 5 retry attempts with 1s delay between attempts (not configurable). See `-remoteWrite.retryMinInterval` and `-remoteWrite.retryMaxTime` cmd-line flags.
* BUGFIX: add the following command-line flags, which can be used for limiting Graphite API calls:
`--search.maxGraphiteTagKeys` for limiting the number of tag keys returned from Graphite `/tags`, `/tags/autoComplete/*`, `/tags/findSeries` API.

View File

@ -1191,6 +1191,10 @@ The shortlist of configuration flags is the following:
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.
-remoteWrite.oauth2.tokenUrl string
Optional OAuth2 tokenURL to use for -notifier.url.
-remoteWrite.retryMaxTime duration
The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval (default 30s)
-remoteWrite.retryMinInterval duration
The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval (default 1s)
-remoteWrite.sendTimeout duration
Timeout for sending data to the configured -remoteWrite.url. (default 30s)
-remoteWrite.showURL