From d0a9b24c5acf7b7bba69e37109003a34ee63484d Mon Sep 17 00:00:00 2001
From: Nikolay <nik@victoriametrics.com>
Date: Wed, 4 Nov 2020 18:03:43 +0300
Subject: [PATCH] reduces memory usage for vmagent, (#880)

* reduces memory usage for vmagent,
limits the number of dropped targets that can be stored for the /api/v1/targets page to 1000 items by default (configurable via -promscrape.maxDroppedTargets),
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/878

* Update app/vmagent/README.md

* Update app/vmagent/README.md

Co-authored-by: Aliaksandr Valialkin <valyala@gmail.com>
---
 app/vmagent/README.md          |  3 +++
 docs/vmagent.md                |  6 ++++++
 lib/promscrape/config.go       |  3 +++
 lib/promscrape/targetstatus.go | 12 +++++++++---
 4 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/app/vmagent/README.md b/app/vmagent/README.md
index 22d2c64030..6d132ad5a5 100644
--- a/app/vmagent/README.md
+++ b/app/vmagent/README.md
@@ -231,6 +231,9 @@ This information may be useful for debugging target relabeling.
   by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
   and `http://vmagent-host:8429/api/v1/targets`.
 
+* The `/api/v1/targets` page may be useful for debugging the relabeling process for scrape targets.
+  This page contains original labels for targets dropped during relabeling (see the "droppedTargets" section in the page output). By default up to `-promscrape.maxDroppedTargets` targets are shown here. If your setup drops more targets during relabeling, then increase the `-promscrape.maxDroppedTargets` command-line flag value in order to see all the dropped targets. Note that tracking each dropped target requires up to 10KB of RAM, so big values for `-promscrape.maxDroppedTargets` may result in increased memory usage if a big number of scrape targets is dropped during relabeling.
+
 * If `vmagent` scrapes targets with millions of metrics per each target (for instance, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
   then it is recommended enabling `stream parsing mode` in order to reduce memory usage during scraping. This mode may be enabled either globally for all the scrape targets
   by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
diff --git a/docs/vmagent.md b/docs/vmagent.md
index 22d2c64030..b947820b98 100644
--- a/docs/vmagent.md
+++ b/docs/vmagent.md
@@ -231,6 +231,12 @@ This information may be useful for debugging target relabeling.
   by passing `-promscrape.suppressScrapeErrors` command-line flag to `vmagent`. The most recent scrape error per each target can be observed at `http://vmagent-host:8429/targets`
   and `http://vmagent-host:8429/api/v1/targets`.
 
+* For debugging the relabeling process of scrape targets, you can observe the status of targets at the `/api/v1/targets` page.
+  You may need to increase the value of the `-promscrape.maxDroppedTargets` command-line flag.  
+  This is needed when `vmagent` scrapes thousands of targets and some of them are dropped during relabeling.
+  Storing those dropped targets requires additional memory (up to 10KB per target).
+  With service discovery mechanisms such as Kubernetes, there may be around 10 000 dropped scrape targets per 100 active scrape targets.
+
 * If `vmagent` scrapes targets with millions of metrics per each target (for instance, when scraping [federation endpoints](https://prometheus.io/docs/prometheus/latest/federation/)),
   then it is recommended enabling `stream parsing mode` in order to reduce memory usage during scraping. This mode may be enabled either globally for all the scrape targets
   by passing `-promscrape.streamParse` command-line flag or on a per-scrape target basis with `stream_parse: true` option. For example:
diff --git a/lib/promscrape/config.go b/lib/promscrape/config.go
index d8ca914699..f61e3d8cdf 100644
--- a/lib/promscrape/config.go
+++ b/lib/promscrape/config.go
@@ -32,6 +32,9 @@ var (
 	dryRun = flag.Bool("promscrape.config.dryRun", false, "Checks -promscrape.config file for errors and unsupported fields and then exits. "+
 		"Returns non-zero exit code on parsing errors and emits these errors to stderr. "+
 		"Pass -loggerLevel=ERROR if you don't need to see info messages in the output")
+	maxDroppedTargets = flag.Int("promscrape.maxDroppedTargets", 1000, "Defines how many targets, dropped during service discovery,"+
+		" can be stored for  /api/v1/targets page. "+
+		"This value may be increased for debugging cause of dropping targets during service discovery relabeling")
 )
 
 // Config represents essential parts from Prometheus config defined at https://prometheus.io/docs/prometheus/latest/configuration/configuration/
diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go
index 48240368c5..dd24f2495e 100644
--- a/lib/promscrape/targetstatus.go
+++ b/lib/promscrape/targetstatus.go
@@ -240,12 +240,18 @@ type droppedTarget struct {
 }
 
 func (dt *droppedTargets) Register(originalLabels []prompbmarshal.Label) {
+
 	key := promLabelsString(originalLabels)
 	currentTime := fasttime.UnixTimestamp()
 	dt.mu.Lock()
-	dt.m[key] = droppedTarget{
-		originalLabels: originalLabels,
-		deadline:       currentTime + 10*60,
+	if k, ok := dt.m[key]; ok {
+		k.deadline = currentTime + 10*60
+		dt.m[key] = k
+	} else if len(dt.m) < *maxDroppedTargets {
+		dt.m[key] = droppedTarget{
+			originalLabels: originalLabels,
+			deadline:       currentTime + 10*60,
+		}
 	}
 	if currentTime-dt.lastCleanupTime > 60 {
 		for k, v := range dt.m {