From 5138eaeea0791caa34bcfab410e0ca9cd253cd8f Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Sat, 8 Oct 2022 01:07:42 +0300 Subject: [PATCH] app/vmselect: allow limiting per-query memory usage via -search.maxMemoryPerQuery command-line flag Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3203 --- README.md | 3 +- app/vmselect/main.go | 1 + app/vmselect/promql/eval.go | 57 ++++++++++++++-------- app/vmselect/promql/memory_limiter.go | 33 ------------- app/vmselect/promql/memory_limiter_test.go | 56 --------------------- docs/CHANGELOG.md | 1 + docs/Cluster-VictoriaMetrics.md | 3 +- docs/README.md | 3 +- docs/Single-server-VictoriaMetrics.md | 3 +- 9 files changed, 46 insertions(+), 114 deletions(-) delete mode 100644 app/vmselect/promql/memory_limiter.go delete mode 100644 app/vmselect/promql/memory_limiter_test.go diff --git a/README.md b/README.md index f8ed2a392..50057b0fe 100644 --- a/README.md +++ b/README.md @@ -1263,7 +1263,8 @@ See also [resource usage limits docs](#resource-usage-limits). By default VictoriaMetrics is tuned for an optimal resource usage under typical workloads. Some workloads may need fine-grained resource usage limits. In these cases the following command-line flags may be useful: -- `-memory.allowedPercent` and `-search.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-memory.allowedPercent` and `-memory.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-search.maxMemoryPerQuery` limits the amounts of memory, which can be used for processing a single query. Queries, which need more memory, are rejected. 
By default this limit is calculated by dividing `-memory.allowedPercent` by `-search.maxConcurrentRequests`. Sometimes a heavy query, which selects big number of time series, may exceed the per-query memory limit by a small percent. The total memory limit for concurrently executed queries can be estimated as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`. - `-search.maxUniqueTimeseries` limits the number of unique time series a single query can find and process. VictoriaMetrics keeps in memory some metainformation about the time series located by each query and spends some CPU time for processing the found time series. This means that the maximum memory usage and CPU usage a single query can use is proportional to `-search.maxUniqueTimeseries`. - `-search.maxQueryDuration` limits the duration of a single query. If the query takes longer than the given duration, then it is canceled. This allows saving CPU and RAM when executing unexpected heavy queries. - `-search.maxConcurrentRequests` limits the number of concurrent requests VictoriaMetrics can process. Bigger number of concurrent requests usually means bigger memory usage. For example, if a single query needs 100 MiB of additional memory during its execution, then 100 concurrent queries may need `100 * 100 MiB = 10 GiB` of additional memory. So it is better to limit the number of concurrent queries, while suspending additional incoming queries if the concurrency limit is reached. VictoriaMetrics provides `-search.maxQueueDuration` command-line flag for limiting the max wait time for suspended queries. 
diff --git a/app/vmselect/main.go b/app/vmselect/main.go index 9363a966d..f61899446 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -59,6 +59,7 @@ func Init() { fs.RemoveDirContents(tmpDirPath) netstorage.InitTmpBlocksDir(tmpDirPath) promql.InitRollupResultCache(*vmstorage.DataPath + "/cache/rollupResult") + promql.InitMaxMemoryPerQuery(*maxConcurrentRequests) concurrencyCh = make(chan struct{}, *maxConcurrentRequests) initVMAlertProxy() diff --git a/app/vmselect/promql/eval.go b/app/vmselect/promql/eval.go index 1b1b860a0..137e8fd0d 100644 --- a/app/vmselect/promql/eval.go +++ b/app/vmselect/promql/eval.go @@ -15,6 +15,7 @@ import ( "github.com/VictoriaMetrics/VictoriaMetrics/lib/bytesutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/cgroup" "github.com/VictoriaMetrics/VictoriaMetrics/lib/decimal" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/flagutil" "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" "github.com/VictoriaMetrics/VictoriaMetrics/lib/memory" "github.com/VictoriaMetrics/VictoriaMetrics/lib/querytracer" @@ -27,7 +28,12 @@ var ( disableCache = flag.Bool("search.disableCache", false, "Whether to disable response caching. This may be useful during data backfilling") maxPointsSubqueryPerTimeseries = flag.Int("search.maxPointsSubqueryPerTimeseries", 100e3, "The maximum number of points per series, which can be generated by subquery. "+ "See https://valyala.medium.com/prometheus-subqueries-in-victoriametrics-9b1492b720b3") - noStaleMarkers = flag.Bool("search.noStaleMarkers", false, "Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets") + maxMemoryPerQuery = flagutil.NewBytes("search.maxMemoryPerQuery", 0, "The maximum amounts of memory a single query may consume. "+ + "Queries requiring more memory are rejected. 
The total memory limit for concurrently executed queries can be estimated as "+ + "-search.maxMemoryPerQuery multiplied by -search.maxConcurrentRequests . "+ + "If the -search.maxMemoryPerQuery isn't set, then it is automatically calculated by dividing -memory.allowedPercent by -search.maxConcurrentRequests") + noStaleMarkers = flag.Bool("search.noStaleMarkers", false, "Set this flag to true if the database doesn't contain Prometheus stale markers, "+ + "so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets") ) // The minimum number of points per timeseries for enabling time rounding. @@ -1051,20 +1057,18 @@ func evalRollupFuncWithMetricExpr(qt *querytracer.Tracer, ec *EvalConfig, funcNa } } rollupPoints := mulNoOverflow(pointsPerTimeseries, int64(timeseriesLen*len(rcs))) - rollupMemorySize = mulNoOverflow(rollupPoints, 16) - rml := getRollupMemoryLimiter() - if !rml.Get(uint64(rollupMemorySize)) { + rollupMemorySize = sumNoOverflow(mulNoOverflow(int64(rssLen), 1000), mulNoOverflow(rollupPoints, 16)) + maxMemory := getMaxMemoryPerQuery() + if rollupMemorySize > maxMemory { rss.Cancel() return nil, &UserReadableError{ - Err: fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series; "+ - "total available memory for concurrent requests: %d bytes; "+ - "requested memory: %d bytes; "+ - "possible solutions are: reducing the number of matching time series; switching to node with more RAM; "+ - "increasing -memory.allowedPercent; increasing `step` query arg (%gs)", - rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, rml.MaxSize, uint64(rollupMemorySize), float64(ec.Step)/1e3), + Err: fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series "+ + "according to -search.maxMemoryPerQuery=%d; requested memory: %d bytes; "+ + "possible solutions are: 
reducing the number of matching time series; increasing -search.maxMemoryPerQuery; "+ + "increasing `step` query arg (%gs)", + rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, maxMemory, rollupMemorySize, float64(ec.Step)/1e3), } } - defer rml.Put(uint64(rollupMemorySize)) // Evaluate rollup keepMetricNames := getKeepMetricNames(expr) @@ -1084,18 +1088,21 @@ func evalRollupFuncWithMetricExpr(qt *querytracer.Tracer, ec *EvalConfig, funcNa return tss, nil } -var ( - rollupMemoryLimiter memoryLimiter - rollupMemoryLimiterOnce sync.Once -) - -func getRollupMemoryLimiter() *memoryLimiter { - rollupMemoryLimiterOnce.Do(func() { - rollupMemoryLimiter.MaxSize = uint64(memory.Allowed()) / 4 - }) - return &rollupMemoryLimiter +func getMaxMemoryPerQuery() int64 { + if n := maxMemoryPerQuery.N; n > 0 { + return int64(n) + } + return maxMemoryPerQueryDefault } +// InitMaxMemoryPerQuery must be called after flag.Parse and before promql usage. +func InitMaxMemoryPerQuery(maxConcurrentRequests int) { + n := int(0.8*float64(memory.Allowed())) / maxConcurrentRequests + maxMemoryPerQueryDefault = int64(n) +} + +var maxMemoryPerQueryDefault int64 + func evalRollupWithIncrementalAggregate(qt *querytracer.Tracer, funcName string, keepMetricNames bool, iafc *incrementalAggrFuncContext, rss *netstorage.Results, rcs []*rollupConfig, preFunc func(values []float64, timestamps []int64), sharedTimestamps []int64) ([]*timeseries, error) { @@ -1227,6 +1234,14 @@ func mulNoOverflow(a, b int64) int64 { return a * b } +func sumNoOverflow(a, b int64) int64 { + if math.MaxInt64-a < b { + // Overflow + return math.MaxInt64 + } + return a + b +} + func dropStaleNaNs(funcName string, values []float64, timestamps []int64) ([]float64, []int64) { if *noStaleMarkers || funcName == "default_rollup" || funcName == "stale_samples_over_time" { // Do not drop Prometheus staleness marks (aka stale NaNs) for default_rollup() function, diff --git a/app/vmselect/promql/memory_limiter.go 
b/app/vmselect/promql/memory_limiter.go deleted file mode 100644 index e9a76b143..000000000 --- a/app/vmselect/promql/memory_limiter.go +++ /dev/null @@ -1,33 +0,0 @@ -package promql - -import ( - "sync" - - "github.com/VictoriaMetrics/VictoriaMetrics/lib/logger" -) - -type memoryLimiter struct { - MaxSize uint64 - - mu sync.Mutex - usage uint64 -} - -func (ml *memoryLimiter) Get(n uint64) bool { - ml.mu.Lock() - ok := n <= ml.MaxSize && ml.MaxSize-n >= ml.usage - if ok { - ml.usage += n - } - ml.mu.Unlock() - return ok -} - -func (ml *memoryLimiter) Put(n uint64) { - ml.mu.Lock() - if n > ml.usage { - logger.Panicf("BUG: n=%d cannot exceed %d", n, ml.usage) - } - ml.usage -= n - ml.mu.Unlock() -} diff --git a/app/vmselect/promql/memory_limiter_test.go b/app/vmselect/promql/memory_limiter_test.go deleted file mode 100644 index 4477678e4..000000000 --- a/app/vmselect/promql/memory_limiter_test.go +++ /dev/null @@ -1,56 +0,0 @@ -package promql - -import ( - "testing" -) - -func TestMemoryLimiter(t *testing.T) { - var ml memoryLimiter - ml.MaxSize = 100 - - // Allocate memory - if !ml.Get(10) { - t.Fatalf("cannot get 10 out of %d bytes", ml.MaxSize) - } - if ml.usage != 10 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 10) - } - if !ml.Get(20) { - t.Fatalf("cannot get 20 out of 90 bytes") - } - if ml.usage != 30 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 30) - } - if ml.Get(1000) { - t.Fatalf("unexpected get for 1000 bytes") - } - if ml.usage != 30 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 30) - } - if ml.Get(71) { - t.Fatalf("unexpected get for 71 bytes") - } - if ml.usage != 30 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 30) - } - if !ml.Get(70) { - t.Fatalf("cannot get 70 bytes") - } - if ml.usage != 100 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 100) - } - - // Return memory back - ml.Put(10) - ml.Put(70) - if ml.usage != 20 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 
20) - } - if !ml.Get(30) { - t.Fatalf("cannot get 30 bytes") - } - ml.Put(50) - if ml.usage != 0 { - t.Fatalf("unexpected usage; got %d; want %d", ml.usage, 0) - } -} diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 71247d2f8..7ee5ce93c 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -15,6 +15,7 @@ The following tip changes can be tested by building VictoriaMetrics components f ## tip +* FEATURE: allow limiting memory usage on a per-query basis with `-search.maxMemoryPerQuery` command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3203). * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): drop all the labels with `__` prefix from discovered targets in the same way as Prometheus does according to [this article](https://www.robustperception.io/life-of-a-label/). Previously the following labels were available during [metric-level relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs): `__address__`, `__scheme__`, `__metrics_path__`, `__scrape_interval__`, `__scrape_timeout__`, `__param_*`. Now these labels are available only during [target-level relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config). This should reduce CPU usage and memory usage for `vmagent` setups, which scrape big number of targets. * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): allow specifying full url in scrape target addresses (aka `__address__` label). This makes valid the following `-promscrape.config`: diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index abe9c8a31..67333769c 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -469,7 +469,8 @@ See also [resource usage limits docs](#resource-usage-limits). By default cluster components of VictoriaMetrics are tuned for an optimal resource usage under typical workloads. 
Some workloads may need fine-grained resource usage limits. In these cases the following command-line flags may be useful: -- `-memory.allowedPercent` and `-search.allowedBytes` limit the amounts of memory, which may be used for various internal caches at all the cluster components of VictoriaMetrics - `vminsert`, `vmselect` and `vmstorage`. Note that VictoriaMetrics components may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-memory.allowedPercent` and `-memory.allowedBytes` limit the amounts of memory, which may be used for various internal caches at all the cluster components of VictoriaMetrics - `vminsert`, `vmselect` and `vmstorage`. Note that VictoriaMetrics components may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-search.maxMemoryPerQuery` limits the amounts of memory, which can be used for processing a single query at `vmselect` node. Queries, which need more memory, are rejected. By default this limit is calculated by dividing `-memory.allowedPercent` by `-search.maxConcurrentRequests`. Sometimes a heavy query, which selects big number of time series, may exceed the per-query memory limit by a small percent. The total memory limit for concurrently executed queries can be estimated as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`. - `-search.maxUniqueTimeseries` at `vmselect` component limits the number of unique time series a single query can find and process. `vmselect` passes the limit to `vmstorage` component, which keeps in memory some metainformation about the time series located by each query and spends some CPU time for processing the found time series. This means that the maximum memory usage and CPU usage a single query can use at `vmstorage` is proportional to `-search.maxUniqueTimeseries`. - `-search.maxQueryDuration` at `vmselect` limits the duration of a single query. 
If the query takes longer than the given duration, then it is canceled. This allows saving CPU and RAM at `vmselect` and `vmstorage` when executing unexpected heavy queries. - `-search.maxConcurrentRequests` at `vmselect` limits the number of concurrent requests a single `vmselect` node can process. Bigger number of concurrent requests usually means bigger memory usage at both `vmselect` and `vmstorage`. For example, if a single query needs 100 MiB of additional memory during its execution, then 100 concurrent queries may need `100 * 100 MiB = 10 GiB` of additional memory. So it is better to limit the number of concurrent queries, while suspending additional incoming queries if the concurrency limit is reached. `vmselect` provides `-search.maxQueueDuration` command-line flag for limiting the max wait time for suspended queries. diff --git a/docs/README.md b/docs/README.md index fae53c21f..4948bd7bd 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1264,7 +1264,8 @@ See also [resource usage limits docs](#resource-usage-limits). By default VictoriaMetrics is tuned for an optimal resource usage under typical workloads. Some workloads may need fine-grained resource usage limits. In these cases the following command-line flags may be useful: -- `-memory.allowedPercent` and `-search.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-memory.allowedPercent` and `-memory.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-search.maxMemoryPerQuery` limits the amounts of memory, which can be used for processing a single query. Queries, which need more memory, are rejected. 
By default this limit is calculated by dividing `-memory.allowedPercent` by `-search.maxConcurrentRequests`. Sometimes a heavy query, which selects big number of time series, may exceed the per-query memory limit by a small percent. The total memory limit for concurrently executed queries can be estimated as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`. - `-search.maxUniqueTimeseries` limits the number of unique time series a single query can find and process. VictoriaMetrics keeps in memory some metainformation about the time series located by each query and spends some CPU time for processing the found time series. This means that the maximum memory usage and CPU usage a single query can use is proportional to `-search.maxUniqueTimeseries`. - `-search.maxQueryDuration` limits the duration of a single query. If the query takes longer than the given duration, then it is canceled. This allows saving CPU and RAM when executing unexpected heavy queries. - `-search.maxConcurrentRequests` limits the number of concurrent requests VictoriaMetrics can process. Bigger number of concurrent requests usually means bigger memory usage. For example, if a single query needs 100 MiB of additional memory during its execution, then 100 concurrent queries may need `100 * 100 MiB = 10 GiB` of additional memory. So it is better to limit the number of concurrent queries, while suspending additional incoming queries if the concurrency limit is reached. VictoriaMetrics provides `-search.maxQueueDuration` command-line flag for limiting the max wait time for suspended queries. diff --git a/docs/Single-server-VictoriaMetrics.md index 0a95c33d7..9aee66f29 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -1267,7 +1267,8 @@ See also [resource usage limits docs](#resource-usage-limits). By default VictoriaMetrics is tuned for an optimal resource usage under typical workloads. 
Some workloads may need fine-grained resource usage limits. In these cases the following command-line flags may be useful: -- `-memory.allowedPercent` and `-search.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-memory.allowedPercent` and `-memory.allowedBytes` limit the amounts of memory, which may be used for various internal caches at VictoriaMetrics. Note that VictoriaMetrics may use more memory, since these flags don't limit additional memory, which may be needed on a per-query basis. +- `-search.maxMemoryPerQuery` limits the amounts of memory, which can be used for processing a single query. Queries, which need more memory, are rejected. By default this limit is calculated by dividing `-memory.allowedPercent` by `-search.maxConcurrentRequests`. Sometimes a heavy query, which selects big number of time series, may exceed the per-query memory limit by a small percent. The total memory limit for concurrently executed queries can be estimated as `-search.maxMemoryPerQuery` multiplied by `-search.maxConcurrentRequests`. - `-search.maxUniqueTimeseries` limits the number of unique time series a single query can find and process. VictoriaMetrics keeps in memory some metainformation about the time series located by each query and spends some CPU time for processing the found time series. This means that the maximum memory usage and CPU usage a single query can use is proportional to `-search.maxUniqueTimeseries`. - `-search.maxQueryDuration` limits the duration of a single query. If the query takes longer than the given duration, then it is canceled. This allows saving CPU and RAM when executing unexpected heavy queries. - `-search.maxConcurrentRequests` limits the number of concurrent requests VictoriaMetrics can process. 
Bigger number of concurrent requests usually means bigger memory usage. For example, if a single query needs 100 MiB of additional memory during its execution, then 100 concurrent queries may need `100 * 100 MiB = 10 GiB` of additional memory. So it is better to limit the number of concurrent queries, while suspending additional incoming queries if the concurrency limit is reached. VictoriaMetrics provides `-search.maxQueueDuration` command-line flag for limiting the max wait time for suspended queries.