mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-11-23 12:31:07 +01:00
vmalert: update retry policy for pushing data to -remoteWrite.url
(#4504)
By default, vmalert will make multiple retry attempts with exponential delay. The total time spent during retry attempts shouldn't exceed `-remoteWrite.retryMaxTime` (default is 30s). When retry time is exceeded vmalert drops the data dedicated for `-remoteWrite.url`. Before, vmalert dropped data after 5 retry attempts with 1s delay between attempts (not configurable). See `-remoteWrite.retryMinInterval` and `-remoteWrite.retryMaxTime` cmd-line flags. Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: Nikolay <nik@victoriametrics.com>
This commit is contained in:
parent
4aad7a43df
commit
5f9ad22884
@ -1180,6 +1180,10 @@ The shortlist of configuration flags is the following:
|
||||
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.
|
||||
-remoteWrite.oauth2.tokenUrl string
|
||||
Optional OAuth2 tokenURL to use for -notifier.url.
|
||||
-remoteWrite.retryMaxTime duration
|
||||
The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval (default 30s)
|
||||
-remoteWrite.retryMinInterval duration
|
||||
The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval (default 1s)
|
||||
-remoteWrite.sendTimeout duration
|
||||
Timeout for sending data to the configured -remoteWrite.url. (default 30s)
|
||||
-remoteWrite.showURL
|
||||
|
@ -23,6 +23,8 @@ import (
|
||||
var (
|
||||
disablePathAppend = flag.Bool("remoteWrite.disablePathAppend", false, "Whether to disable automatic appending of '/api/v1/write' path to the configured -remoteWrite.url.")
|
||||
sendTimeout = flag.Duration("remoteWrite.sendTimeout", 30*time.Second, "Timeout for sending data to the configured -remoteWrite.url.")
|
||||
retryMinInterval = flag.Duration("remoteWrite.retryMinInterval", time.Second, "The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval")
|
||||
retryMaxTime = flag.Duration("remoteWrite.retryMaxTime", time.Second*30, "The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval")
|
||||
)
|
||||
|
||||
// Client is an asynchronous HTTP client for writing
|
||||
@ -186,11 +188,6 @@ var (
|
||||
bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)
|
||||
)
|
||||
|
||||
var (
|
||||
retryCount = 5
|
||||
retryBackoff = time.Second
|
||||
)
|
||||
|
||||
// flush is a blocking function that marshals WriteRequest and sends
|
||||
// it to remote-write endpoint. Flush performs limited amount of retries
|
||||
// if request fails.
|
||||
@ -208,8 +205,14 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
|
||||
}
|
||||
|
||||
b := snappy.Encode(nil, data)
|
||||
L:
|
||||
for attempts := 0; attempts < retryCount; attempts++ {
|
||||
|
||||
retryInterval, maxRetryInterval := *retryMinInterval, *retryMaxTime
|
||||
if retryInterval > maxRetryInterval {
|
||||
retryInterval = maxRetryInterval
|
||||
}
|
||||
timeStart := time.Now()
|
||||
L:
|
||||
for attempts := 0; ; attempts++ {
|
||||
err := c.send(ctx, b)
|
||||
if err == nil {
|
||||
sentRows.Add(len(wr.Timeseries))
|
||||
@ -233,8 +236,19 @@ L:
|
||||
default:
|
||||
}
|
||||
|
||||
// sleeping to avoid remote db hammering
|
||||
time.Sleep(retryBackoff)
|
||||
timeLeftForRetries := maxRetryInterval - time.Since(timeStart)
|
||||
if timeLeftForRetries < 0 {
|
||||
// the max retry time has passed, so we give up
|
||||
break
|
||||
}
|
||||
|
||||
if retryInterval > timeLeftForRetries {
|
||||
retryInterval = timeLeftForRetries
|
||||
}
|
||||
// sleeping to prevent remote db hammering
|
||||
time.Sleep(retryInterval)
|
||||
retryInterval *= 2
|
||||
|
||||
}
|
||||
|
||||
droppedRows.Add(len(wr.Timeseries))
|
||||
|
@ -19,10 +19,10 @@ import (
|
||||
)
|
||||
|
||||
func TestClient_Push(t *testing.T) {
|
||||
oldRetryBackoff := retryBackoff
|
||||
retryBackoff = time.Millisecond * 10
|
||||
oldMinInterval := *retryMinInterval
|
||||
*retryMinInterval = time.Millisecond * 10
|
||||
defer func() {
|
||||
retryBackoff = oldRetryBackoff
|
||||
*retryMinInterval = oldMinInterval
|
||||
}()
|
||||
|
||||
testSrv := newRWServer()
|
||||
|
@ -33,6 +33,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
|
||||
* FEATURE: vmstorage: suppress "broken pipe" and "connection reset by peer" errors for search queries on vmstorage side. See [this](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4418/commits/a6a7795b9e1f210d614a2c5f9a3016b97ded4792) and [this](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4498/commits/830dac177f0f09032165c248943a5da0e10dfe90) commits.
|
||||
* FEATURE: [Official Grafana dashboards for VictoriaMetrics](https://grafana.com/orgs/victoriametrics): add panel for tracking rate of syscalls while writing or reading from disk via `process_io_(read|write)_syscalls_total` metrics.
|
||||
* FEATURE: accept timestamps in milliseconds at `start`, `end` and `time` query args in [Prometheus querying API](https://docs.victoriametrics.com/#prometheus-querying-api-usage). See [these docs](https://docs.victoriametrics.com/#timestamp-formats) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4459).
|
||||
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): update retry policy for pushing data to `-remoteWrite.url`. By default, vmalert will make multiple retry attempts with exponential delay. The total time spent during retry attempts shouldn't exceed `-remoteWrite.retryMaxTime` (default is 30s). When retry time is exceeded vmalert drops the data dedicated for `-remoteWrite.url`. Before, vmalert dropped data after 5 retry attempts with 1s delay between attempts (not configurable). See `-remoteWrite.retryMinInterval` and `-remoteWrite.retryMaxTime` cmd-line flags.
|
||||
|
||||
* BUGFIX: add the following command-line flags, which can be used for limiting Graphite API calls:
|
||||
`--search.maxGraphiteTagKeys` for limiting the number of tag keys returned from Graphite `/tags`, `/tags/autoComplete/*`, `/tags/findSeries` API.
|
||||
|
@ -1191,6 +1191,10 @@ The shortlist of configuration flags is the following:
|
||||
Optional OAuth2 scopes to use for -notifier.url. Scopes must be delimited by ';'.
|
||||
-remoteWrite.oauth2.tokenUrl string
|
||||
Optional OAuth2 tokenURL to use for -notifier.url.
|
||||
-remoteWrite.retryMaxTime duration
|
||||
The max time spent on retry attempts for the failed remote-write request. Change this value if it is expected for remoteWrite.url to be unreachable for more than -remoteWrite.retryMaxTime. See also -remoteWrite.retryMinInterval (default 30s)
|
||||
-remoteWrite.retryMinInterval duration
|
||||
The minimum delay between retry attempts. Every next retry attempt will double the delay to prevent hammering of remote database. See also -remoteWrite.retryMaxInterval (default 1s)
|
||||
-remoteWrite.sendTimeout duration
|
||||
Timeout for sending data to the configured -remoteWrite.url. (default 30s)
|
||||
-remoteWrite.showURL
|
||||
|
Loading…
Reference in New Issue
Block a user