mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-15 16:30:55 +01:00
lib/promscrape/discovery/kubernetes: fix watcher start order for roles endpoints and endpointslice (#5557)
* lib/promscrape/discovery/kubernetes: fix watcher start order for the endpoints and endpointslice roles Previously the groupWatcher could be mistakenly stopped when requests for pod or service resources took too long. * remove misleading comment * docs/sd_configs.md: mention -promscrape.kubernetes.attachNodeMetadataAll flag in the description for attach_metadata section Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4640 * wip * lib/promscrape/kubernetes: prevent groupWatcher from being stopped when there are in-flight apiWatcher.mustStart() calls groupWatcher is stopped if it has zero registered apiWatchers for 14 seconds. But such a groupWatcher can still be in use if apiWatcher for `role: endpoints` or `role: endpointslice` is being registered and the discovery of the associated `pod` and/or `service` objects takes longer than 14 seconds - see the beginning of groupWatcher.startWatchersForRole() function for details. Track the number of in-flight calls to apiWatcher.mustStart() and prevent the associated groupWatcher from being stopped if the number of in-flight calls is non-zero. P.S. postponing the discovery of `pod` and/or `service` objects associated with `endpoints` or `endpointslice` roles isn't the best solution, since it slows down initial discovery of `endpoints` and `endpointslice` targets. * typo fix --------- Co-authored-by: Aliaksandr Valialkin <valyala@victoriametrics.com>
This commit is contained in:
parent
885ee160c2
commit
49fa92c1d0
@ -67,6 +67,7 @@ The sandbox cluster installation is running under the constant load generated by
|
|||||||
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Previously, retries happened only on failed writes into the network connection between source and destination, while errors returned by the server after all the data was transmitted were logged but not retried.
|
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl.html): retry on import errors in `vm-native` mode. Previously, retries happened only on failed writes into the network connection between source and destination, while errors returned by the server after all the data was transmitted were logged but not retried.
|
||||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.
|
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly assume role with [AWS IRSA authorization](https://docs.aws.amazon.com/eks/latest/userguide/iam-roles-for-service-accounts.html). Previously role chaining was not supported. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3822) for details.
|
||||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): exit if there is config syntax error in [`scrape_config_files`](https://docs.victoriametrics.com/vmagent.html#loading-scrape-configs-from-multiple-files) when `-promscrape.config.strictParse=true`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5508).
|
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): exit if there is config syntax error in [`scrape_config_files`](https://docs.victoriametrics.com/vmagent.html#loading-scrape-configs-from-multiple-files) when `-promscrape.config.strictParse=true`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5508).
|
||||||
|
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly discover targets for `role: endpoints` and `role: endpointslice` in [kubernetes_sd_configs](https://docs.victoriametrics.com/sd_configs.html#kubernetes_sd_configs). Previously some `endpoints` and `endpointslice` targets could be left undiscovered, or some targets could have missing `__meta_*` labels, when performing service discovery in busy Kubernetes clusters with a large number of pods. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5557).
|
||||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not store scrape response for target in memory when staleness markers are disabled. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5577) for details.
|
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not store scrape response for target in memory when staleness markers are disabled. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5577) for details.
|
||||||
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix a link for the statistic inaccuracy explanation in the cardinality explorer tool. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5460).
|
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix a link for the statistic inaccuracy explanation in the cardinality explorer tool. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5460).
|
||||||
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): send `step` param for instant queries. The change reverts [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3896) due to reasons explained in [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3896#issuecomment-1896704401).
|
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): send `step` param for instant queries. The change reverts [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3896) due to reasons explained in [this comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3896#issuecomment-1896704401).
|
||||||
|
@ -103,7 +103,9 @@ func newAPIWatcher(apiServer string, ac *promauth.Config, sdc *SDConfig, swcFunc
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (aw *apiWatcher) mustStart() {
|
func (aw *apiWatcher) mustStart() {
|
||||||
|
atomic.AddInt32(&aw.gw.apiWatcherInflightStartCalls, 1)
|
||||||
aw.gw.startWatchersForRole(aw.role, aw)
|
aw.gw.startWatchersForRole(aw.role, aw)
|
||||||
|
atomic.AddInt32(&aw.gw.apiWatcherInflightStartCalls, -1)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (aw *apiWatcher) updateSwosCount(multiplier int, swosByKey map[string][]interface{}) {
|
func (aw *apiWatcher) updateSwosCount(multiplier int, swosByKey map[string][]interface{}) {
|
||||||
@ -209,6 +211,10 @@ func (aw *apiWatcher) getScrapeWorkObjects() []interface{} {
|
|||||||
// groupWatcher watches for Kubernetes objects on the given apiServer with the given namespaces,
|
// groupWatcher watches for Kubernetes objects on the given apiServer with the given namespaces,
|
||||||
// selectors and attachNodeMetadata using the given client.
|
// selectors and attachNodeMetadata using the given client.
|
||||||
type groupWatcher struct {
|
type groupWatcher struct {
|
||||||
|
// The number of in-flight apiWatcher.mustStart() calls for the given groupWatcher.
|
||||||
|
// This field is used by groupWatchersCleaner() in order to determine when the given groupWatcher can be stopped.
|
||||||
|
apiWatcherInflightStartCalls int32
|
||||||
|
|
||||||
// Old Kubernetes doesn't support /apis/networking.k8s.io/v1/, so /apis/networking.k8s.io/v1beta1/ must be used instead.
|
// Old Kubernetes doesn't support /apis/networking.k8s.io/v1/, so /apis/networking.k8s.io/v1beta1/ must be used instead.
|
||||||
// This flag is used for automatic substitution of v1 API path with v1beta1 API path during requests to apiServer.
|
// This flag is used for automatic substitution of v1 API path with v1beta1 API path during requests to apiServer.
|
||||||
useNetworkingV1Beta1 uint32
|
useNetworkingV1Beta1 uint32
|
||||||
@ -309,11 +315,7 @@ func selectorsKey(selectors []Selector) string {
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
groupWatchersLock sync.Mutex
|
groupWatchersLock sync.Mutex
|
||||||
groupWatchers = func() map[string]*groupWatcher {
|
groupWatchers map[string]*groupWatcher
|
||||||
gws := make(map[string]*groupWatcher)
|
|
||||||
go groupWatchersCleaner(gws)
|
|
||||||
return gws
|
|
||||||
}()
|
|
||||||
|
|
||||||
_ = metrics.NewGauge(`vm_promscrape_discovery_kubernetes_group_watchers`, func() float64 {
|
_ = metrics.NewGauge(`vm_promscrape_discovery_kubernetes_group_watchers`, func() float64 {
|
||||||
groupWatchersLock.Lock()
|
groupWatchersLock.Lock()
|
||||||
@ -323,11 +325,16 @@ var (
|
|||||||
})
|
})
|
||||||
)
|
)
|
||||||
|
|
||||||
func groupWatchersCleaner(gws map[string]*groupWatcher) {
|
func init() {
|
||||||
|
groupWatchers = make(map[string]*groupWatcher)
|
||||||
|
go groupWatchersCleaner()
|
||||||
|
}
|
||||||
|
|
||||||
|
func groupWatchersCleaner() {
|
||||||
for {
|
for {
|
||||||
time.Sleep(7 * time.Second)
|
time.Sleep(7 * time.Second)
|
||||||
groupWatchersLock.Lock()
|
groupWatchersLock.Lock()
|
||||||
for key, gw := range gws {
|
for key, gw := range groupWatchers {
|
||||||
gw.mu.Lock()
|
gw.mu.Lock()
|
||||||
// Calculate the number of apiWatcher instances subscribed to gw.
|
// Calculate the number of apiWatcher instances subscribed to gw.
|
||||||
awsTotal := 0
|
awsTotal := 0
|
||||||
@ -335,14 +342,14 @@ func groupWatchersCleaner(gws map[string]*groupWatcher) {
|
|||||||
awsTotal += len(uw.aws) + len(uw.awsPending)
|
awsTotal += len(uw.aws) + len(uw.awsPending)
|
||||||
}
|
}
|
||||||
|
|
||||||
if awsTotal == 0 {
|
if awsTotal == 0 && atomic.LoadInt32(&gw.apiWatcherInflightStartCalls) == 0 {
|
||||||
// There are no API watchers subscribed to gw.
|
// There are no API watchers subscribed to gw and there are no in-flight apiWatcher.mustStart() calls.
|
||||||
// Stop all the urlWatcher instances at gw and drop gw from gws in this case,
|
// Stop all the urlWatcher instances at gw and drop gw from groupWatchers in this case,
|
||||||
// but do it only on the second iteration in order to reduce urlWatcher churn
|
// but do it only on the second iteration in order to reduce urlWatcher churn
|
||||||
// during scrape config reloads.
|
// during scrape config reloads.
|
||||||
if gw.noAPIWatchers {
|
if gw.noAPIWatchers {
|
||||||
gw.cancel()
|
gw.cancel()
|
||||||
delete(gws, key)
|
delete(groupWatchers, key)
|
||||||
} else {
|
} else {
|
||||||
gw.noAPIWatchers = true
|
gw.noAPIWatchers = true
|
||||||
}
|
}
|
||||||
@ -432,6 +439,7 @@ func (gw *groupWatcher) startWatchersForRole(role string, aw *apiWatcher) {
|
|||||||
if gw.attachNodeMetadata && (role == "pod" || role == "endpoints" || role == "endpointslice") {
|
if gw.attachNodeMetadata && (role == "pod" || role == "endpoints" || role == "endpointslice") {
|
||||||
gw.startWatchersForRole("node", nil)
|
gw.startWatchersForRole("node", nil)
|
||||||
}
|
}
|
||||||
|
|
||||||
paths := getAPIPathsWithNamespaces(role, gw.namespaces, gw.selectors)
|
paths := getAPIPathsWithNamespaces(role, gw.namespaces, gw.selectors)
|
||||||
for _, path := range paths {
|
for _, path := range paths {
|
||||||
apiURL := gw.apiServer + path
|
apiURL := gw.apiServer + path
|
||||||
|
Loading…
Reference in New Issue
Block a user