From f3d4671bb6b594dd789d8b8ea84c0df3ecbc5963 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 12 Apr 2022 12:36:17 +0300 Subject: [PATCH] lib/promscrape: follow-up after 7e79adfb555b65eeb35fd482dbb95a0002d63cea --- app/vmagent/README.md | 2 ++ docs/CHANGELOG.md | 13 ++++++++----- docs/vmagent.md | 2 ++ lib/promscrape/config.go | 6 ++---- lib/promscrape/scraper.go | 4 +--- lib/promscrape/scrapework.go | 2 +- 6 files changed, 16 insertions(+), 13 deletions(-) diff --git a/app/vmagent/README.md b/app/vmagent/README.md index ca94f705f..d8f35c5e2 100644 --- a/app/vmagent/README.md +++ b/app/vmagent/README.md @@ -358,6 +358,8 @@ spread scrape targets among a cluster of two `vmagent` instances: /path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ... ``` +The `-promscrape.cluster.memberNum` can be set to a StatefulSet pod name when `vmagent` runs in Kubernetes. The pod name must end with a number in the range `0 ... promscrape.cluster.membersCount-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`. + By default each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances, then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas. For example, the following commands start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances: diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 62043a5cb..d50c21667 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -15,12 +15,15 @@ The following tip changes can be tested by building VictoriaMetrics components f ## tip -FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add support for `alert_relabel_configs` option at `-notifier.config`.
This option allows configuring relabeling rules for alerts before sending them to configured notifiers. See [these docs](https://docs.victoriametrics.com/vmalert.html#notifier-configuration-file) for details. +**Update notes:** this release introduces backwards-incompatible changes to communication protocol between `vmselect` and `vmstorage` nodes in cluster version of VictoriaMetrics, so `vmselect` and `vmstorage` nodes may log communication errors during the upgrade. These errors should stop after all the `vmselect` and `vmstorage` nodes are updated to new release. -BUGFIX: properly propagate limits at `-search.max*` command-line flags from `vminsert` to `vmstorage`. The limits are `-search.maxUniqueTimeseries`, `-search.maxSeries`, `-search.maxFederateSeries`, `-search.maxExportSeries`, `-search.maxGraphiteSeries` and `-search.maxTSDBStatusSeries`. They weren't propagated to `vmstorage` because of the bug. These limits were introduced in [v1.76.0](https://docs.victoriametrics.com/CHANGELOG.html#v1760). -BUGFIX: fix goroutine leak and possible deadlock when importing invalid data via [native binary format](https://docs.victoriametrics.com/#how-to-import-data-in-native-format). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2423). -BUGFIX: [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage): properly calculate [hitCount](https://graphite.readthedocs.io/en/latest/functions.html#graphite.render.functions.hitcount) function. Previously it could return empty results if there were no original samples in some parts of the selected time range. -BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): allow overriding built-in function names inside [WITH templates](https://play.victoriametrics.com/promql/expand-with-exprs). For example, `WITH (sum(a,b) = a + b + 1) sum(x,y)` now expands into `x + y + 1`. Previously such a query would fail with `cannot use reserved name` error. 
See [this bugreport](https://github.com/VictoriaMetrics/metricsql/issues/5). +* FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add support for `alert_relabel_configs` option at `-notifier.config`. This option allows configuring relabeling rules for alerts before sending them to configured notifiers. See [these docs](https://docs.victoriametrics.com/vmalert.html#notifier-configuration-file) for details. +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow passing StatefulSet pod names to `-promscrape.cluster.memberNum` command-line flag. In this case the member number is automatically extracted from the pod name, which must end with the number in the range `0 ... promscrape.cluster.membersCount-1`. For example, `vmagent-0`, `vmagent-1`, etc. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2359) and [these docs](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets). + +* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly propagate limits at `-search.max*` command-line flags from `vmselect` to `vmstorage`. The limits are `-search.maxUniqueTimeseries`, `-search.maxSeries`, `-search.maxFederateSeries`, `-search.maxExportSeries`, `-search.maxGraphiteSeries` and `-search.maxTSDBStatusSeries`. They weren't propagated to `vmstorage` because of the bug. These limits were introduced in [v1.76.0](https://docs.victoriametrics.com/CHANGELOG.html#v1760). +* BUGFIX: fix goroutine leak and possible deadlock when importing invalid data via [native binary format](https://docs.victoriametrics.com/#how-to-import-data-in-native-format). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2423).
+* BUGFIX: [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage): properly calculate [hitCount](https://graphite.readthedocs.io/en/latest/functions.html#graphite.render.functions.hitcount) function. Previously it could return empty results if there were no original samples in some parts of the selected time range. +* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): allow overriding built-in function names inside [WITH templates](https://play.victoriametrics.com/promql/expand-with-exprs). For example, `WITH (sum(a,b) = a + b + 1) sum(x,y)` now expands into `x + y + 1`. Previously such a query would fail with `cannot use reserved name` error. See [this bugreport](https://github.com/VictoriaMetrics/metricsql/issues/5). ## [v1.76.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.76.0) diff --git a/docs/vmagent.md b/docs/vmagent.md index 163c8e02e..651b5c9d2 100644 --- a/docs/vmagent.md +++ b/docs/vmagent.md @@ -362,6 +362,8 @@ spread scrape targets among a cluster of two `vmagent` instances: /path/to/vmagent -promscrape.cluster.membersCount=2 -promscrape.cluster.memberNum=1 -promscrape.config=/path/to/config.yml ... ``` +The `-promscrape.cluster.memberNum` can be set to a StatefulSet pod name when `vmagent` runs in Kubernetes. The pod name must end with a number in the range `0 ... promscrape.cluster.membersCount-1`. For example, `-promscrape.cluster.memberNum=vmagent-0`. + By default each scrape target is scraped only by a single `vmagent` instance in the cluster. If there is a need for replicating scrape targets among multiple `vmagent` instances, then `-promscrape.cluster.replicationFactor` command-line flag must be set to the desired number of replicas.
For example, the following commands start a cluster of three `vmagent` instances, where each target is scraped by two `vmagent` instances: diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go index 52c32858f..15e2feef6 100644 --- a/lib/promscrape/config.go +++ b/lib/promscrape/config.go @@ -58,8 +58,7 @@ var ( var clusterMemberID int -// must be called before any scraper -func initClusterMemberID() error { +func mustInitClusterMemberID() { s := *clusterMemberNum // special case for kubernetes deployment, where pod-name formatted at some-pod-name-1 // obtain memberNum from last segment @@ -69,10 +68,9 @@ func initClusterMemberID() error { } n, err := strconv.ParseInt(s, 10, 64) if err != nil { - return fmt.Errorf("cannot parse -promscrape.cluster.memberNum=%q: %w", *clusterMemberNum, err) + logger.Fatalf("cannot parse -promscrape.cluster.memberNum=%q: %s", *clusterMemberNum, err) } clusterMemberID = int(n) - return nil } // Config represents essential parts from Prometheus config defined at https://prometheus.io/docs/prometheus/latest/configuration/configuration/ diff --git a/lib/promscrape/scraper.go b/lib/promscrape/scraper.go index c6415d416..0ca5511d9 100644 --- a/lib/promscrape/scraper.go +++ b/lib/promscrape/scraper.go @@ -52,6 +52,7 @@ func CheckConfig() error { // // Scraped data is passed to pushData. 
func Init(pushData func(wr *prompbmarshal.WriteRequest)) { + mustInitClusterMemberID() globalStopChan = make(chan struct{}) scraperWG.Add(1) go func() { @@ -99,9 +100,6 @@ func runScraper(configFile string, pushData func(wr *prompbmarshal.WriteRequest) // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1240 sighupCh := procutil.NewSighupChan() - if err := initClusterMemberID(); err != nil { - logger.Fatalf("cannot init clusterMembership: %s", err) - } logger.Infof("reading Prometheus configs from %q", configFile) cfg, data, err := loadConfig(configFile) if err != nil { diff --git a/lib/promscrape/scrapework.go b/lib/promscrape/scrapework.go index aef8f85b1..3e20633ca 100644 --- a/lib/promscrape/scrapework.go +++ b/lib/promscrape/scrapework.go @@ -276,7 +276,7 @@ func (sw *scrapeWork) run(stopCh <-chan struct{}, globalStopCh <-chan struct{}) // scrapes replicated targets at different time offsets. This guarantees that the deduplication consistently leaves samples // received from the same vmagent replica. // See https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets - key := fmt.Sprintf("ClusterMemberNum=%d, ScrapeURL=%s, Labels=%s", *clusterMemberNum, sw.Config.ScrapeURL, sw.Config.LabelsString()) + key := fmt.Sprintf("ClusterMemberNum=%d, ScrapeURL=%s, Labels=%s", clusterMemberID, sw.Config.ScrapeURL, sw.Config.LabelsString()) h := xxhash.Sum64(bytesutil.ToUnsafeBytes(key)) randSleep = uint64(float64(scrapeInterval) * (float64(h) / (1 << 64))) sleepOffset := uint64(time.Now().UnixNano()) % uint64(scrapeInterval)