lib/promscrape: explicitly stop and cleanup service discovery routines when new config is read from -promscrape.config

This should reduce memory usage when `-promscrape.config` file frequently changes
This commit is contained in:
Aliaksandr Valialkin 2021-03-01 14:13:56 +02:00
parent e32ad9e923
commit 62ebf5c88e
14 changed files with 190 additions and 80 deletions

View File

@ -59,6 +59,15 @@ type Config struct {
baseDir string
}
func (cfg *Config) mustStop() {
startTime := time.Now()
logger.Infof("stopping service discovery routines...")
for i := range cfg.ScrapeConfigs {
cfg.ScrapeConfigs[i].mustStop()
}
logger.Infof("stopped service discovery routines in %.3f seconds", time.Since(startTime).Seconds())
}
// GlobalConfig represents essential parts for `global` section of Prometheus config.
//
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/
@ -91,7 +100,7 @@ type ScrapeConfig struct {
OpenStackSDConfigs []openstack.SDConfig `yaml:"openstack_sd_configs,omitempty"`
ConsulSDConfigs []consul.SDConfig `yaml:"consul_sd_configs,omitempty"`
EurekaSDConfigs []eureka.SDConfig `yaml:"eureka_sd_configs,omitempty"`
DockerSwarmConfigs []dockerswarm.SDConfig `yaml:"dockerswarm_sd_configs,omitempty"`
DockerSwarmSDConfigs []dockerswarm.SDConfig `yaml:"dockerswarm_sd_configs,omitempty"`
DNSSDConfigs []dns.SDConfig `yaml:"dns_sd_configs,omitempty"`
EC2SDConfigs []ec2.SDConfig `yaml:"ec2_sd_configs,omitempty"`
GCESDConfigs []gce.SDConfig `yaml:"gce_sd_configs,omitempty"`
@ -109,6 +118,33 @@ type ScrapeConfig struct {
swc *scrapeWorkConfig
}
func (sc *ScrapeConfig) mustStop() {
for i := range sc.KubernetesSDConfigs {
sc.KubernetesSDConfigs[i].MustStop()
}
for i := range sc.OpenStackSDConfigs {
sc.OpenStackSDConfigs[i].MustStop()
}
for i := range sc.ConsulSDConfigs {
sc.ConsulSDConfigs[i].MustStop()
}
for i := range sc.EurekaSDConfigs {
sc.EurekaSDConfigs[i].MustStop()
}
for i := range sc.DockerSwarmSDConfigs {
sc.DockerSwarmSDConfigs[i].MustStop()
}
for i := range sc.DNSSDConfigs {
sc.DNSSDConfigs[i].MustStop()
}
for i := range sc.EC2SDConfigs {
sc.EC2SDConfigs[i].MustStop()
}
for i := range sc.GCESDConfigs {
sc.GCESDConfigs[i].MustStop()
}
}
// FileSDConfig represents file-based service discovery config.
//
// See https://prometheus.io/docs/prometheus/latest/configuration/configuration/#file_sd_config
@ -259,8 +295,8 @@ func (cfg *Config) getDockerSwarmSDScrapeWork(prev []*ScrapeWork) []*ScrapeWork
sc := &cfg.ScrapeConfigs[i]
dstLen := len(dst)
ok := true
for j := range sc.DockerSwarmConfigs {
sdc := &sc.DockerSwarmConfigs[j]
for j := range sc.DockerSwarmSDConfigs {
sdc := &sc.DockerSwarmSDConfigs[j]
var okLocal bool
dst, okLocal = appendSDScrapeWork(dst, sdc, cfg.baseDir, sc.swc, "dockerswarm_sd_config")
if ok {

View File

@ -20,6 +20,10 @@ type apiConfig struct {
consulWatcher *consulWatcher
}
func (ac *apiConfig) mustStop() {
ac.consulWatcher.mustStop()
}
var configMap = discoveryutils.NewConfigMap()
func getAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) {

View File

@ -37,3 +37,13 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
ms := getServiceNodesLabels(cfg)
return ms, nil
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
v := configMap.Delete(sdc)
if v != nil {
// v can be nil if GetLabels wasn't called yet.
cfg := v.(*apiConfig)
cfg.mustStop()
}
}

View File

@ -27,12 +27,12 @@ type consulWatcher struct {
watchServices []string
watchTags []string
// servicesLock protects services and servicesLastAccessTime
servicesLock sync.Mutex
services map[string]*serviceWatcher
servicesLastAccessTime time.Time
// servicesLock protects services
servicesLock sync.Mutex
services map[string]*serviceWatcher
wg sync.WaitGroup
wg sync.WaitGroup
stopCh chan struct{}
}
type serviceWatcher struct {
@ -55,18 +55,25 @@ func newConsulWatcher(client *discoveryutils.Client, sdc *SDConfig, datacenter s
serviceNodesQueryArgs += "&tag=" + url.QueryEscape(tag)
}
cw := &consulWatcher{
client: client,
serviceNamesQueryArgs: baseQueryArgs,
serviceNodesQueryArgs: serviceNodesQueryArgs,
watchServices: sdc.Services,
watchTags: sdc.Tags,
services: make(map[string]*serviceWatcher),
servicesLastAccessTime: time.Now(),
client: client,
serviceNamesQueryArgs: baseQueryArgs,
serviceNodesQueryArgs: serviceNodesQueryArgs,
watchServices: sdc.Services,
watchTags: sdc.Tags,
services: make(map[string]*serviceWatcher),
stopCh: make(chan struct{}),
}
go cw.watchForServicesUpdates()
return cw
}
func (cw *consulWatcher) mustStop() {
close(cw.stopCh)
// Do not wait for the watcher to stop, since it may take
// up to discoveryutils.BlockingClientReadTimeout to complete.
// TODO: add ability to cancel blocking requests.
}
// watchForServicesUpdates watches for new services and updates it in cw.
func (cw *consulWatcher) watchForServicesUpdates() {
checkInterval := getCheckInterval()
@ -129,13 +136,12 @@ func (cw *consulWatcher) watchForServicesUpdates() {
logger.Infof("started Consul service watcher for %q", clientAddr)
f()
for range ticker.C {
cw.servicesLock.Lock()
lastAccessTime := cw.servicesLastAccessTime
cw.servicesLock.Unlock()
if time.Since(lastAccessTime) > 3*checkInterval {
// The given cw is no longer used. Stop all service watchers and exit.
logger.Infof("starting to stop Consul service watchers for %q", clientAddr)
for {
select {
case <-ticker.C:
f()
case <-cw.stopCh:
logger.Infof("stopping Consul service watchers for %q", clientAddr)
startTime := time.Now()
cw.servicesLock.Lock()
for _, sw := range cw.services {
@ -146,8 +152,6 @@ func (cw *consulWatcher) watchForServicesUpdates() {
logger.Infof("stopped Consul service watcher for %q in %.3f seconds", clientAddr, time.Since(startTime).Seconds())
return
}
f()
}
}
@ -236,7 +240,6 @@ func (cw *consulWatcher) getServiceNodesSnapshot() []ServiceNode {
for _, sw := range cw.services {
sns = append(sns, sw.serviceNodes...)
}
cw.servicesLastAccessTime = time.Now()
cw.servicesLock.Unlock()
return sns
}

View File

@ -46,6 +46,11 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
}
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
// nothing to do
}
func getSRVAddrLabels(ctx context.Context, sdc *SDConfig) []map[string]string {
type result struct {
name string

View File

@ -47,3 +47,8 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
return nil, fmt.Errorf("unexpected `role`: %q; must be one of `tasks`, `services` or `nodes`; skipping it", sdc.Role)
}
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
configMap.Delete(sdc)
}

View File

@ -42,3 +42,8 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
}
return ms, nil
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
configMap.Delete(sdc)
}

View File

@ -102,6 +102,11 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
return addInstanceLabels(apps, port), nil
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
configMap.Delete(sdc)
}
func addInstanceLabels(apps *applications, port int) []map[string]string {
var ms []map[string]string
for _, app := range apps.Applications {

View File

@ -56,3 +56,8 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
ms := getInstancesLabels(cfg)
return ms, nil
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
configMap.Delete(sdc)
}

View File

@ -1,6 +1,7 @@
package kubernetes
import (
"context"
"encoding/json"
"errors"
"flag"
@ -16,7 +17,6 @@ import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils"
@ -30,6 +30,10 @@ type apiConfig struct {
aw *apiWatcher
}
func (ac *apiConfig) mustStop() {
ac.aw.mustStop()
}
var configMap = discoveryutils.NewConfigMap()
func getAPIConfig(sdc *SDConfig, baseDir string) (*apiConfig, error) {
@ -137,18 +141,24 @@ type apiWatcher struct {
// Selectors to apply during watch
selectors []Selector
// mu protects watchersByURL and lastAccessTime
// mu protects watchersByURL
mu sync.Mutex
// a map of watchers keyed by request paths
// a map of watchers keyed by request urls
watchersByURL map[string]*urlWatcher
// The last time the apiWatcher was queried for cached objects.
// It is used for stopping unused watchers.
lastAccessTime uint64
stopFunc func()
stopCtx context.Context
wg sync.WaitGroup
}
func (aw *apiWatcher) mustStop() {
aw.stopFunc()
aw.wg.Wait()
}
func newAPIWatcher(client *http.Client, apiServer, authorization string, namespaces []string, selectors []Selector) *apiWatcher {
stopCtx, stopFunc := context.WithCancel(context.Background())
return &apiWatcher{
apiServer: apiServer,
authorization: authorization,
@ -158,7 +168,8 @@ func newAPIWatcher(client *http.Client, apiServer, authorization string, namespa
watchersByURL: make(map[string]*urlWatcher),
lastAccessTime: fasttime.UnixTimestamp(),
stopFunc: stopFunc,
stopCtx: stopCtx,
}
}
@ -177,7 +188,6 @@ func (aw *apiWatcher) getLabelsForRole(role string) []map[string]string {
}
uw.mu.Unlock()
}
aw.lastAccessTime = fasttime.UnixTimestamp()
aw.mu.Unlock()
return ms
}
@ -202,7 +212,6 @@ func (aw *apiWatcher) getObjectByRole(role, namespace, name string) object {
break
}
}
aw.lastAccessTime = fasttime.UnixTimestamp()
aw.mu.Unlock()
return o
}
@ -230,8 +239,12 @@ func (aw *apiWatcher) startWatcherForURL(role, apiURL string, parseObject parseO
uw.watchersCount.Inc()
uw.watchersCreated.Inc()
resourceVersion := uw.reloadObjects()
aw.wg.Add(1)
go func() {
defer aw.wg.Done()
logger.Infof("started watcher for %q", apiURL)
uw.watchForUpdates(resourceVersion)
logger.Infof("stopped watcher for %q", apiURL)
uw.mu.Lock()
uw.objectsCount.Add(-len(uw.objectsByKey))
uw.objectsRemoved.Add(len(uw.objectsByKey))
@ -245,16 +258,19 @@ func (aw *apiWatcher) startWatcherForURL(role, apiURL string, parseObject parseO
}()
}
// needStop returns true if aw wasn't used for long time.
// needStop returns true if aw must be stopped.
func (aw *apiWatcher) needStop() bool {
aw.mu.Lock()
defer aw.mu.Unlock()
return fasttime.UnixTimestamp() > aw.lastAccessTime+5*60
select {
case <-aw.stopCtx.Done():
return true
default:
return false
}
}
// doRequest performs http request to the given requestURL.
func (aw *apiWatcher) doRequest(requestURL string) (*http.Response, error) {
req, err := http.NewRequest("GET", requestURL, nil)
req, err := http.NewRequestWithContext(aw.stopCtx, "GET", requestURL, nil)
if err != nil {
logger.Fatalf("cannot create a request for %q: %s", requestURL, err)
}
@ -316,10 +332,13 @@ func (aw *apiWatcher) newURLWatcher(role, apiURL string, parseObject parseObject
// reloadObjects reloads objects to the latest state and returns resourceVersion for the latest state.
func (uw *urlWatcher) reloadObjects() string {
aw := uw.aw
requestURL := uw.apiURL
resp, err := uw.aw.doRequest(requestURL)
resp, err := aw.doRequest(requestURL)
if err != nil {
logger.Errorf("error when performing a request to %q: %s", requestURL, err)
if !aw.needStop() {
logger.Errorf("error when performing a request to %q: %s", requestURL, err)
}
return ""
}
body, _ := ioutil.ReadAll(resp.Body)
@ -330,12 +349,14 @@ func (uw *urlWatcher) reloadObjects() string {
}
objectsByKey, metadata, err := uw.parseObjectList(body)
if err != nil {
logger.Errorf("cannot parse response from %q: %s", requestURL, err)
if !aw.needStop() {
logger.Errorf("cannot parse response from %q: %s", requestURL, err)
}
return ""
}
labelsByKey := make(map[string][]map[string]string, len(objectsByKey))
for k, o := range objectsByKey {
labelsByKey[k] = o.getTargetLabels(uw.aw)
labelsByKey[k] = o.getTargetLabels(aw)
}
uw.mu.Lock()
uw.objectsRemoved.Add(-len(uw.objectsByKey))
@ -368,10 +389,8 @@ func (uw *urlWatcher) watchForUpdates(resourceVersion string) {
}
timeoutSeconds := time.Duration(0.9 * float64(aw.client.Timeout)).Seconds()
apiURL += delimiter + "watch=1&timeoutSeconds=" + strconv.Itoa(int(timeoutSeconds))
logger.Infof("started watcher for %q", apiURL)
for {
if aw.needStop() {
logger.Infof("stopped unused watcher for %q", apiURL)
return
}
requestURL := apiURL
@ -380,6 +399,9 @@ func (uw *urlWatcher) watchForUpdates(resourceVersion string) {
}
resp, err := aw.doRequest(requestURL)
if err != nil {
if aw.needStop() {
return
}
logger.Errorf("error when performing a request to %q: %s", requestURL, err)
backoffSleep()
resourceVersion = uw.reloadObjects()
@ -402,6 +424,9 @@ func (uw *urlWatcher) watchForUpdates(resourceVersion string) {
err = uw.readObjectUpdateStream(resp.Body)
_ = resp.Body.Close()
if err != nil {
if aw.needStop() {
return
}
if !errors.Is(err, io.EOF) {
logger.Errorf("error when reading WatchEvent stream from %q: %s", requestURL, err)
}

View File

@ -50,3 +50,13 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
return nil, fmt.Errorf("unexpected `role`: %q; must be one of `node`, `pod`, `service`, `endpoints`, `endpointslices` or `ingress`; skipping it", sdc.Role)
}
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
v := configMap.Delete(sdc)
if v != nil {
// v can be nil if GetLabels wasn't called yet.
cfg := v.(*apiConfig)
cfg.mustStop()
}
}

View File

@ -46,3 +46,8 @@ func (sdc *SDConfig) GetLabels(baseDir string) ([]map[string]string, error) {
return nil, fmt.Errorf("unexpected `role`: %q; must be one of `instance` or `hypervisor`; skipping it", sdc.Role)
}
}
// MustStop stops further usage for sdc.
func (sdc *SDConfig) MustStop() {
configMap.Delete(sdc)
}

View File

@ -2,24 +2,25 @@ package discoveryutils
import (
"sync"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/fasttime"
"github.com/VictoriaMetrics/metrics"
)
// ConfigMap is a map for storing discovery api configs.
//
// It automatically removes old configs which weren't accessed recently.
type ConfigMap struct {
mu sync.Mutex
m map[interface{}]*configMapEntry
hasCleaner bool
mu sync.Mutex
m map[interface{}]interface{}
entriesCount *metrics.Counter
}
// NewConfigMap creates ConfigMap
func NewConfigMap() *ConfigMap {
return &ConfigMap{
m: make(map[interface{}]*configMapEntry),
m: make(map[interface{}]interface{}),
entriesCount: metrics.GetOrCreateCounter(`vm_promscrape_discoveryutils_configmap_entries_count`),
}
}
@ -32,42 +33,30 @@ func (cm *ConfigMap) Get(key interface{}, newConfig func() (interface{}, error))
cm.mu.Lock()
defer cm.mu.Unlock()
if !cm.hasCleaner {
cm.hasCleaner = true
go cm.cleaner()
}
e := cm.m[key]
if e != nil {
e.lastAccessTime = fasttime.UnixTimestamp()
return e.cfg, nil
cfg := cm.m[key]
if cfg != nil {
return cfg, nil
}
cfg, err := newConfig()
if err != nil {
return nil, err
}
cm.m[key] = &configMapEntry{
cfg: cfg,
lastAccessTime: fasttime.UnixTimestamp(),
}
cm.m[key] = cfg
cm.entriesCount.Inc()
return cfg, nil
}
func (cm *ConfigMap) cleaner() {
tc := time.NewTicker(15 * time.Minute)
for range tc.C {
currentTime := fasttime.UnixTimestamp()
cm.mu.Lock()
for k, e := range cm.m {
if currentTime-e.lastAccessTime > 10*60 {
delete(cm.m, k)
}
}
cm.mu.Unlock()
}
}
// Delete deletes config for the given key from cm and returns it.
func (cm *ConfigMap) Delete(key interface{}) interface{} {
cm.mu.Lock()
defer cm.mu.Unlock()
type configMapEntry struct {
cfg interface{}
lastAccessTime uint64
cfg := cm.m[key]
if cfg == nil {
// The cfg can be missing if it wasn't accessed yet.
return nil
}
cm.entriesCount.Dec()
delete(cm.m, key)
return cfg
}

View File

@ -129,6 +129,7 @@ func runScraper(configFile string, pushData func(wr *prompbmarshal.WriteRequest)
logger.Infof("nothing changed in %q", configFile)
goto waitForChans
}
cfg.mustStop()
cfg = cfgNew
data = dataNew
case <-tickerCh:
@ -141,9 +142,11 @@ func runScraper(configFile string, pushData func(wr *prompbmarshal.WriteRequest)
// Nothing changed since the previous loadConfig
goto waitForChans
}
cfg.mustStop()
cfg = cfgNew
data = dataNew
case <-globalStopCh:
cfg.mustStop()
logger.Infof("stopping Prometheus scrapers")
startTime := time.Now()
scs.stop()