Mirror of https://github.com/VictoriaMetrics/VictoriaMetrics.git, synced 2024-12-03 16:21:14 +01:00
app/vmalert: update remote-write process (#5284)
* app/vmalert: update remote-write process

  * automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-lived connections or environments without keep-alive support on network balancers.
  * increment the `vmalert_remotewrite_errors_total` metric if all retries to send a remote-write request failed. Before, this metric was incremented only if the remote-write client's buffer was overloaded.
  * increment the `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if the remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls.

  Signed-off-by: hagen1778 <roman@victoriametrics.com>

* Update docs/CHANGELOG.md

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: Hui Wang <haley@victoriametrics.com>
Commit: bffd30b57a
Parent: 5afc6a5765
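To make the retry behavior described in the commit message concrete, here is a minimal, self-contained Go sketch of the retry-on-closed-connection idea. The `sendFn` type and `sendWithRetryOnEOF` helper are hypothetical illustrations, not part of vmalert's API.

```go
package main

import (
	"errors"
	"fmt"
	"io"
)

// sendFn stands in for an HTTP remote-write call; it is a hypothetical
// placeholder, not the actual vmalert client method.
type sendFn func(data []byte) error

// sendWithRetryOnEOF performs one extra attempt when the first call fails
// with io.EOF, which is what Go's HTTP client typically returns when a
// keep-alive connection was closed by the other side between requests.
func sendWithRetryOnEOF(send sendFn, data []byte) error {
	err := send(data)
	if errors.Is(err, io.EOF) {
		// The connection was likely closed by a load balancer or peer;
		// retry once instead of immediately logging an error.
		err = send(data)
	}
	return err
}

func main() {
	calls := 0
	flaky := func(data []byte) error {
		calls++
		if calls == 1 {
			return io.EOF // simulate a closed keep-alive connection
		}
		return nil
	}
	fmt.Println(sendWithRetryOnEOF(flaky, []byte("payload"))) // <nil>
}
```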
@@ -3,6 +3,7 @@ package remotewrite
 import (
     "bytes"
     "context"
+    "errors"
     "flag"
     "fmt"
     "io"
@@ -117,12 +118,19 @@ func NewClient(ctx context.Context, cfg Config) (*Client, error) {
 // Push adds timeseries into queue for writing into remote storage.
 // Push returns and error if client is stopped or if queue is full.
 func (c *Client) Push(s prompbmarshal.TimeSeries) error {
+    rwTotal.Inc()
     select {
     case <-c.doneCh:
+        rwErrors.Inc()
+        droppedRows.Add(len(s.Samples))
+        droppedBytes.Add(s.Size())
         return fmt.Errorf("client is closed")
     case c.input <- s:
         return nil
     default:
+        rwErrors.Inc()
+        droppedRows.Add(len(s.Samples))
+        droppedBytes.Add(s.Size())
         return fmt.Errorf("failed to push timeseries - queue is full (%d entries). "+
             "Queue size is controlled by -remoteWrite.maxQueueSize flag",
             c.maxQueueSize)
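For readers less familiar with Go channels, the following standalone sketch shows the same non-blocking `select` pattern the Push hunk relies on: the call never blocks, and a full buffer or a closed client is reported as an error. All names below are illustrative, not vmalert's.

```go
package main

import "fmt"

// push never blocks: it reports an error when the client is closed or
// the bounded queue (the buffered channel) is already full.
func push(input chan string, done chan struct{}, v string) error {
	select {
	case <-done:
		return fmt.Errorf("client is closed")
	case input <- v:
		return nil
	default:
		return fmt.Errorf("queue is full (%d entries)", cap(input))
	}
}

func main() {
	done := make(chan struct{})
	input := make(chan string, 1)

	fmt.Println(push(input, done, "a")) // <nil> - the buffered slot is free
	fmt.Println(push(input, done, "b")) // queue is full (1 entries)

	close(done)
	fmt.Println(push(input, done, "c")) // client is closed
}
```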
@@ -181,11 +189,14 @@ func (c *Client) run(ctx context.Context) {
 }

 var (
+    rwErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
+    rwTotal  = metrics.NewCounter(`vmalert_remotewrite_total`)
+
     sentRows     = metrics.NewCounter(`vmalert_remotewrite_sent_rows_total`)
     sentBytes    = metrics.NewCounter(`vmalert_remotewrite_sent_bytes_total`)
-    sendDuration = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
     droppedRows  = metrics.NewCounter(`vmalert_remotewrite_dropped_rows_total`)
     droppedBytes = metrics.NewCounter(`vmalert_remotewrite_dropped_bytes_total`)
+    sendDuration        = metrics.NewFloatCounter(`vmalert_remotewrite_send_duration_seconds_total`)
     bufferFlushDuration = metrics.NewHistogram(`vmalert_remotewrite_flush_duration_seconds`)

     _ = metrics.NewGauge(`vmalert_remotewrite_concurrency`, func() float64 {
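The counters above come from the `github.com/VictoriaMetrics/metrics` package, which keeps registered metrics in a default set and renders them in Prometheus text format. A rough, hypothetical example of registering and exposing such counters (the metric names and HTTP handler below are made up for illustration and are not part of vmalert):

```go
package main

import (
	"net/http"

	"github.com/VictoriaMetrics/metrics"
)

// Counters created with metrics.NewCounter are registered in the package's
// default set and show up automatically in the /metrics output below.
var (
	demoTotal  = metrics.NewCounter(`demo_requests_total`)
	demoErrors = metrics.NewCounter(`demo_request_errors_total`)
)

func main() {
	http.HandleFunc("/work", func(w http.ResponseWriter, r *http.Request) {
		demoTotal.Inc()
		if r.URL.Query().Get("fail") != "" {
			demoErrors.Inc()
			http.Error(w, "failed", http.StatusInternalServerError)
			return
		}
		w.Write([]byte("ok"))
	})
	// Expose all registered metrics in Prometheus text exposition format.
	http.HandleFunc("/metrics", func(w http.ResponseWriter, r *http.Request) {
		metrics.WritePrometheus(w, true)
	})
	http.ListenAndServe(":8080", nil)
}
```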
@@ -222,6 +233,11 @@ func (c *Client) flush(ctx context.Context, wr *prompbmarshal.WriteRequest) {
 L:
     for attempts := 0; ; attempts++ {
         err := c.send(ctx, b)
+        if errors.Is(err, io.EOF) {
+            // Something in the middle between client and destination might be closing
+            // the connection. So we do a one more attempt in hope request will succeed.
+            err = c.send(ctx, b)
+        }
         if err == nil {
             sentRows.Add(len(wr.Timeseries))
             sentBytes.Add(len(b))
@@ -259,6 +275,7 @@ L:

     }

+    rwErrors.Inc()
     droppedRows.Add(len(wr.Timeseries))
     droppedBytes.Add(len(b))
     logger.Errorf("attempts to send remote-write request failed - dropping %d time series",
@@ -303,7 +320,7 @@ func (c *Client) send(ctx context.Context, data []byte) error {
     // Prometheus remote Write compatible receivers MUST
     switch resp.StatusCode / 100 {
     case 2:
-        // respond with a HTTP 2xx status code when the write is successful.
+        // respond with HTTP 2xx status code when write is successful.
         return nil
     case 4:
         if resp.StatusCode != http.StatusTooManyRequests {
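The status-code handling shown above follows the Prometheus remote-write recommendation: 2xx means success, 4xx responses other than 429 should not be retried, while 429 and 5xx may be retried. A small illustrative sketch of that classification (not vmalert's exact code):

```go
package main

import (
	"fmt"
	"net/http"
)

// classify mirrors the status-code-class idea from the hunk above:
// 2xx is success, 4xx (except 429 Too Many Requests) is not worth
// retrying, and everything else may be retried.
func classify(statusCode int) string {
	switch statusCode / 100 {
	case 2:
		return "success"
	case 4:
		if statusCode != http.StatusTooManyRequests {
			return "non-retriable"
		}
		return "retriable"
	default:
		return "retriable"
	}
}

func main() {
	for _, code := range []int{200, 204, 400, 404, 429, 500, 502} {
		fmt.Printf("%d => %s\n", code, classify(code))
	}
}
```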
@@ -663,9 +663,6 @@ var (

     execTotal  = metrics.NewCounter(`vmalert_execution_total`)
     execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
-
-    remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
-    remoteWriteTotal  = metrics.NewCounter(`vmalert_remotewrite_total`)
 )

 func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
@@ -686,9 +683,7 @@ func (e *executor) exec(ctx context.Context, r Rule, ts time.Time, resolveDuration time.Duration, limit int) error {
     pushToRW := func(tss []prompbmarshal.TimeSeries) error {
         var lastErr error
         for _, ts := range tss {
-            remoteWriteTotal.Inc()
             if err := e.Rw.Push(ts); err != nil {
-                remoteWriteErrors.Inc()
                 lastErr = fmt.Errorf("rule %q: remote write failure: %w", r, err)
             }
         }
@@ -56,6 +56,7 @@ The sandbox cluster installation is running under the constant load generated by
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow specifying full url in notifier static_configs target address, like `http://alertmanager:9093/test/api/v2/alerts`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5184).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): reduce the number of queries for restoring alerts state on start-up. The change should speed up the restore process and reduce pressure on `remoteRead.url`. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5265).
 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add label `file` pointing to the group's filename to metrics `vmalert_recording_.*` and `vmalert_alerts_.*`. The filename should help identifying alerting rules belonging to specific groups with identical names but different filenames. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5267).
+* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically retry remote-write requests on closed connections. The change should reduce the amount of logs produced in environments with short-living connections or environments without support of keep-alive on network balancers.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): support data ingestion from [NewRelic infrastructure agent](https://docs.newrelic.com/docs/infrastructure/install-infrastructure-agent). See [these docs](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#how-to-send-data-from-newrelic-agent), [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3520) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4712).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add `-remoteWrite.shardByURL.labels` command-line flag, which can be used for specifying a list of labels for sharding outgoing samples among the configured `-remoteWrite.url` destinations if `-remoteWrite.shardByURL` command-line flag is set. See [these docs](https://docs.victoriametrics.com/vmagent.html#sharding-among-remote-storages) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4942) for details.
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not exit on startup when [scrape_configs](https://docs.victoriametrics.com/sd_configs.html#scrape_configs) refer to non-existing or invalid files with auth configs, since these files may appear / updated later. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4959) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
@@ -82,6 +83,8 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if alert's `for` param was bigger than replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_errors_total` metric if all retries to send remote-write request failed. Before, this metric was incremented only if remote-write client's buffer is overloaded.
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): increment `vmalert_remotewrite_dropped_rows_total` and `vmalert_remotewrite_dropped_bytes_total` metrics if remote-write client's buffer is overloaded. Before, these metrics were incremented only after unsuccessful HTTP calls.
 * BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087).
 * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
 * BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.