From 49fa92c1d0e6cfcbf39429a4bff49efa9a6b6961 Mon Sep 17 00:00:00 2001
From: Hui Wang <haley@victoriametrics.com>
Date: Mon, 22 Jan 2024 05:13:15 +0800
Subject: [PATCH] lib/promscrape/discovery/kubernetes: fix watcher start order
 for roles endpoints and endpointslice (#5557)

* lib/promscrape/discovery/kubernetes: fix watcher start order for roles endpoints and endpointslice

Previously the groupWatcher could be mistakenly stopped when requests for pod or services resources take too long.

* remove mislead comment

* docs/sd_configs.md: mention -promscrape.kubernetes.attachNodeMetadataAll flag in the description for attach_metadata section

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4640

* wip

* lib/promscrape/kubernetes: prevent from stopping groupWatcher when there are in-flight apiWatcher.mustStart() calls

groupWatcher is stopped if it has zero registered apiWatchers during 14 seconds.
But such a groupWatcher can be still in use if apiWatcher for `role: endpoints` or `role: endpointslice`
is being registered and the discovery of the associated `pod` and/or `service` objects takes longer
than 14 seconds - see the beginning of groupWatcher.startWatchersForRole() function for details.

Track the number of in-flight calls to apiWatcher.mustStart() and prevent from stopping the associated groupWatcher
if the number of in-flight calls is non-zero.

P.S. postponing the discovery of `pod` and/or `service` objects associated with `endpoints` or `endpointslice` roles
isn't the best solution, since it slows down initial discovery of `endpoints` and `endpointslice` targets.

* typo fix

---------

Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
---
 docs/CHANGELOG.md                             |  1 +
 .../discovery/kubernetes/api_watcher.go       | 30 ++++++++++++-------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 76f1082de3..3ef90abe5f 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -67,6 +67,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Before, retries happened only on writes into a network connection between source and destination. But errors returned by server after all the data was transmitted were logged, but not retried.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): exit if there is config syntax error in [`scrape_config_files`](https://docs.victoriametrics.com/vmagent.html#loading-scrape-configs-from-multiple-files) when `-promscrape.config.strictParse=true`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5508).
+* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover targets for `role: endpoints` and `role: endpointslice` in [kubernetes_sd_configs](https://docs.victoriametrics.com/sd_configs.html#kubernetes_sd_configs). Previously some `endpoints` and `endpointslice` targets could be left undiscovered or some targets could have missing `__meta_*` labels when performing service discovery in busy Kubernetes clusters with large number of pods. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5557).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not store scrape response for target in memory when staleness markers are disabled. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5577) for details.
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix a link for the statistic inaccuracy explanation in the cardinality explorer tool. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5460).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): send `step` param for instant queries. The change reverts [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3896) due to reasons explained in [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3896#issuecomment-1896704401).
diff --git a/lib/promscrape/discovery/kubernetes/api_watcher.go b/lib/promscrape/discovery/kubernetes/api_watcher.go
index 4b9a12cad6..a27ad3c5fc 100644
--- a/lib/promscrape/discovery/kubernetes/api_watcher.go
+++ b/lib/promscrape/discovery/kubernetes/api_watcher.go
@@ -103,7 +103,9 @@ func newAPIWatcher(apiServer string, ac *promauth.Config, sdc *SDConfig, swcFunc
 }
 
 func (aw *apiWatcher) mustStart() {
+	atomic.AddInt32(&aw.gw.apiWatcherInflightStartCalls, 1)
 	aw.gw.startWatchersForRole(aw.role, aw)
+	atomic.AddInt32(&aw.gw.apiWatcherInflightStartCalls, -1)
 }
 
 func (aw *apiWatcher) updateSwosCount(multiplier int, swosByKey map[string][]interface{}) {
@@ -209,6 +211,10 @@ func (aw *apiWatcher) getScrapeWorkObjects() []interface{} {
 // groupWatcher watches for Kubernetes objects on the given apiServer with the given namespaces,
 // selectors and attachNodeMetadata using the given client.
 type groupWatcher struct {
+	// The number of in-flight apiWatcher.mustStart() calls for the given groupWatcher.
+	// This field is used by groupWatchersCleaner() in order to determine when the given groupWatcher can be stopped.
+	apiWatcherInflightStartCalls int32
+
 	// Old Kubernetes doesn't support /apis/networking.k8s.io/v1/, so /apis/networking.k8s.io/v1beta1/ must be used instead.
 	// This flag is used for automatic substitution of v1 API path with v1beta1 API path during requests to apiServer.
 	useNetworkingV1Beta1 uint32
@@ -309,11 +315,7 @@ func selectorsKey(selectors []Selector) string {
 
 var (
 	groupWatchersLock sync.Mutex
-	groupWatchers     = func() map[string]*groupWatcher {
-		gws := make(map[string]*groupWatcher)
-		go groupWatchersCleaner(gws)
-		return gws
-	}()
+	groupWatchers     map[string]*groupWatcher
 
 	_ = metrics.NewGauge(`vm_promscrape_discovery_kubernetes_group_watchers`, func() float64 {
 		groupWatchersLock.Lock()
@@ -323,11 +325,16 @@ var (
 	})
 )
 
-func groupWatchersCleaner(gws map[string]*groupWatcher) {
+func init() {
+	groupWatchers = make(map[string]*groupWatcher)
+	go groupWatchersCleaner()
+}
+
+func groupWatchersCleaner() {
 	for {
 		time.Sleep(7 * time.Second)
 		groupWatchersLock.Lock()
-		for key, gw := range gws {
+		for key, gw := range groupWatchers {
 			gw.mu.Lock()
 			// Calculate the number of apiWatcher instances subscribed to gw.
 			awsTotal := 0
@@ -335,14 +342,14 @@ func groupWatchersCleaner(gws map[string]*groupWatcher) {
 				awsTotal += len(uw.aws) + len(uw.awsPending)
 			}
 
-			if awsTotal == 0 {
-				// There are no API watchers subscribed to gw.
-				// Stop all the urlWatcher instances at gw and drop gw from gws in this case,
+			if awsTotal == 0 && atomic.LoadInt32(&gw.apiWatcherInflightStartCalls) == 0 {
+				// There are no API watchers subscribed to gw and there are no in-flight apiWatcher.mustStart() calls.
+				// Stop all the urlWatcher instances at gw and drop gw from groupWatchers in this case,
 				// but do it only on the second iteration in order to reduce urlWatcher churn
 				// during scrape config reloads.
 				if gw.noAPIWatchers {
 					gw.cancel()
-					delete(gws, key)
+					delete(groupWatchers, key)
 				} else {
 					gw.noAPIWatchers = true
 				}
@@ -432,6 +439,7 @@ func (gw *groupWatcher) startWatchersForRole(role string, aw *apiWatcher) {
 	if gw.attachNodeMetadata && (role == "pod" || role == "endpoints" || role == "endpointslice") {
 		gw.startWatchersForRole("node", nil)
 	}
+
 	paths := getAPIPathsWithNamespaces(role, gw.namespaces, gw.selectors)
 	for _, path := range paths {
 		apiURL := gw.apiServer + path