vmagent: expose metrics for tracking config state (#3375)

Expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics
for tracking relabel and scrape configuration hot-reloads.

https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345
Signed-off-by: hagen1778 <roman@victoriametrics.com>

Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Roman Khavronenko 2022-11-21 23:38:43 +01:00 committed by Aliaksandr Valialkin
parent 2f9df62795
commit d1169c1559
No known key found for this signature in database
GPG Key ID: A72BEC6CD3D0DED1
4 changed files with 47 additions and 3 deletions

View File

@ -13,6 +13,7 @@ import (
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bloomfilter"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/memory"
@ -139,6 +140,8 @@ func Init() {
logger.Fatalf("cannot load relabel configs: %s", err)
}
allRelabelConfigs.Store(rcs)
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
if len(*remoteWriteURLs) > 0 {
rwctxsDefault = newRemoteWriteCtxs(nil, *remoteWriteURLs)
@ -154,18 +157,31 @@ func Init() {
case <-stopCh:
return
}
configReloads.Inc()
logger.Infof("SIGHUP received; reloading relabel configs pointed by -remoteWrite.relabelConfig and -remoteWrite.urlRelabelConfig")
rcs, err := loadRelabelConfigs()
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot reload relabel configs; preserving the previous configs; error: %s", err)
continue
}
allRelabelConfigs.Store(rcs)
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
logger.Infof("Successfully reloaded relabel configs")
}
}()
}
// Metrics describing the state of the relabel configs, which are loaded in Init
// and reloaded when SIGHUP is received.
var (
// configReloads is incremented on every SIGHUP-triggered reload attempt,
// before the new configs are loaded, so it counts failed attempts too.
configReloads = metrics.NewCounter(`vm_relabel_config_reloads_total`)
// configReloadErrors is incremented each time reloading the relabel configs fails.
configReloadErrors = metrics.NewCounter(`vm_relabel_config_reloads_errors_total`)
// configSuccess is set to 1 after the relabel configs are loaded successfully
// and to 0 after a failed reload.
// NOTE(review): the value goes 1 -> 0 -> 1, which is gauge-like behaviour, yet
// the metric is created via NewCounter — confirm this is intended.
configSuccess = metrics.NewCounter(`vm_relabel_config_last_reload_successful`)
// configTimestamp is set to fasttime.UnixTimestamp() after every successful
// load of the relabel configs; it is left untouched when a reload fails.
configTimestamp = metrics.NewCounter(`vm_relabel_config_last_reload_success_timestamp_seconds`)
)
func newRemoteWriteCtxs(at *auth.Token, urls []string) []*remoteWriteCtx {
if len(urls) == 0 {
logger.Panicf("BUG: urls must be non-empty")

View File

@ -119,4 +119,16 @@ groups:
dashboard: "http://localhost:3000/d/G7Z9GzMGz?viewPanel=90&var-instance={{ $labels.instance }}"
summary: "Instance {{ $labels.instance }} reached 90% of the limit"
description: "Max series limit set via -remoteWrite.maxDailySeries flag is close to reaching the max value.
Then samples for new time series will be dropped instead of sending them to remote storage systems."
Then samples for new time series will be dropped instead of sending them to remote storage systems."
- alert: ConfigurationReloadFailure
expr: |
vm_promscrape_config_last_reload_successful != 1
or
vm_relabel_config_last_reload_successful != 1
labels:
severity: warning
annotations:
summary: "Configuration reload failed for vmagent instance {{ $labels.instance }}"
description: "Configuration hot-reload failed for vmagent on instance {{ $labels.instance }}.
Check vmagent's logs for detailed error message."

View File

@ -27,6 +27,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add copy button to row on Table view. The button copies row in MetricQL format. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2815).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add the ability to "stick" a tooltip on the chart by clicking on a data point. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3321) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3376).
* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add default alert list for vmalert's metrics. See [alerts-vmalert.yml](https://github.com/VictoriaMetrics/VictoriaMetrics/blob/master/deployment/docker/alerts-vmalert.yml).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): expose `vm_relabel_config_*` and `vm_promscrape_config_*` metrics for tracking relabel and scrape configuration hot-reloads. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3345).
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return an empty result from [limit_offset](https://docs.victoriametrics.com/MetricsQL.html#limit_offset) if the `offset` arg exceeds the number of inner time series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3312).
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover GCE zones when `filter` option is set at [gce_sd_configs](https://docs.victoriametrics.com/sd_configs.html#gce_sd_configs). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3202).

View File

@ -4,12 +4,13 @@ import (
"bytes"
"flag"
"fmt"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"io"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/auth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/procutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
@ -112,6 +113,9 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
configData.Store(&marshaledData)
cfg.mustStart()
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
scs := newScrapeConfigs(pushData, globalStopCh)
scs.add("azure_sd_configs", *azure.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getAzureSDScrapeWork(swsPrev) })
scs.add("consul_sd_configs", *consul.SDCheckInterval, func(cfg *Config, swsPrev []*ScrapeWork) []*ScrapeWork { return cfg.getConsulSDScrapeWork(swsPrev) })
@ -143,6 +147,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
logger.Infof("SIGHUP received; reloading Prometheus configs from %q", configFile)
cfgNew, dataNew, err := loadConfig(configFile)
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot read %q on SIGHUP: %s; continuing with the previous config", configFile, err)
goto waitForChans
}
@ -158,6 +164,8 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
case <-tickerCh:
cfgNew, dataNew, err := loadConfig(configFile)
if err != nil {
configReloadErrors.Inc()
configSuccess.Set(0)
logger.Errorf("cannot read %q: %s; continuing with the previous config", configFile, err)
goto waitForChans
}
@ -180,10 +188,17 @@ func runScraper(configFile string, pushData func(at *auth.Token, wr *prompbmarsh
}
logger.Infof("found changes in %q; applying these changes", configFile)
configReloads.Inc()
configSuccess.Set(1)
configTimestamp.Set(fasttime.UnixTimestamp())
}
}
var configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`)
// Metrics describing the state of the scrape config file handled by runScraper.
var (
// configReloads is incremented after changes found in the config file are
// applied; failed loads do not increment it.
configReloads = metrics.NewCounter(`vm_promscrape_config_reloads_total`)
// configReloadErrors is incremented each time loadConfig fails, both on
// SIGHUP and on the periodic ticker check.
configReloadErrors = metrics.NewCounter(`vm_promscrape_config_reloads_errors_total`)
// configSuccess is set to 1 after the config is started or changes are
// applied, and to 0 after loadConfig fails.
// NOTE(review): the value goes 1 -> 0 -> 1, which is gauge-like behaviour, yet
// the metric is created via NewCounter — confirm this is intended.
configSuccess = metrics.NewCounter(`vm_promscrape_config_last_reload_successful`)
// configTimestamp is set to fasttime.UnixTimestamp() whenever configSuccess
// is set to 1; it is left untouched when loading fails.
configTimestamp = metrics.NewCounter(`vm_promscrape_config_last_reload_success_timestamp_seconds`)
)
type scrapeConfigs struct {
pushData func(at *auth.Token, wr *prompbmarshal.WriteRequest)