app/vmselect: follow-up for 626073bca8

* Rename -search.maxMetricsPointSearch to -search.maxSamplesPerQuery, so it is more consistent with the existing -search.maxSamplesPerSeries * Move the -search.maxSamplesPerQuery from vmstorage to vmselect, so it could effectively limit the number of raw samples obtained from all the vmstorage nodes * Document the -search.maxSamplesPerQuery in docs/CHANGELOG.md
2025-01-20 07:19:17 +01:00 · 2021-07-28 17:40:09 +03:00 · 2021-07-28 17:40:09 +03:00 · 49bf3abf67
commit 49bf3abf67
parent 626073bca8
7 changed files with 25 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -650,8 +650,10 @@ Below is the output for `/path/to/vmselect -help`:
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
-    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
+    	The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery (default 30000000)
  -search.maxStalenessInterval duration
    	The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
  -search.maxStatusRequestDuration duration
@ -758,7 +760,7 @@ Below is the output for `/path/to/vmstorage -help`:
  -search.maxTagValues int
    	The maximum number of tag values returned per search (default 100000)
  -search.maxUniqueTimeseries int
-    	The maximum number of unique time series each search can scan (default 300000)
+    	The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries (default 300000)
  -smallMergeConcurrency int
    	The maximum number of CPU cores to use for small merges. Default value is used if set to 0
  -snapshotAuthKey string
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@ -35,7 +35,8 @@ import (
 var (
 	replicationFactor = flag.Int("replicationFactor", 1, "How many copies of every time series is available on vmstorage nodes. "+
 		"See -replicationFactor command-line flag for vminsert nodes")
-	maxSamplesPerSeries = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage")
+	maxSamplesPerSeries = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery")
+	maxSamplesPerQuery  = flag.Int("search.maxSamplesPerQuery", 1e9, "The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries")
 )

 // Result is a single timeseries result.
@ -433,7 +434,7 @@ func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.
 			for _, sb := range upw.sbs {
 				samples += len(sb.Timestamps)
 			}
-			if samples < *maxSamplesPerSeries {
+			if *maxSamplesPerSeries <= 0 || samples < *maxSamplesPerSeries {
 				sbs = append(sbs, upw.sbs...)
 			} else {
 				firstErr = fmt.Errorf("cannot process more than %d samples per series; either increase -search.maxSamplesPerSeries "+
@ -1390,6 +1391,7 @@ func ProcessSearchQuery(at *auth.Token, denyPartialResponse bool, sq *storage.Se
 	}
 	var wg syncwg.WaitGroup
 	var stopped uint32
+	var samples uint64
 	processBlock := func(mb *storage.MetricBlock) error {
 		wg.Add(1)
 		defer wg.Done()
@ -1400,6 +1402,10 @@ func ProcessSearchQuery(at *auth.Token, denyPartialResponse bool, sq *storage.Se
 			tbfw.RegisterEmptyBlock(mb)
 			return nil
 		}
+		n := atomic.AddUint64(&samples, uint64(mb.Block.RowsCount()))
+		if *maxSamplesPerQuery > 0 && n > uint64(*maxSamplesPerQuery) {
+			return fmt.Errorf("cannot select more than -search.maxSamplesPerQuery=%d samples; possible solutions: to increase the -search.maxSamplesPerQuery; to reduce time range for the query; to use more specific label filters in order to select lower number of series", *maxSamplesPerQuery)
+		}
 		if err := tbfw.RegisterAndWriteBlock(mb); err != nil {
 			return fmt.Errorf("cannot write MetricBlock to temporary blocks file: %w", err)
 		}
--- a/app/vmstorage/transport/server.go
+++ b/app/vmstorage/transport/server.go
@ -28,8 +28,7 @@ var (
 	maxTagKeysPerSearch          = flag.Int("search.maxTagKeys", 100e3, "The maximum number of tag keys returned per search")
 	maxTagValuesPerSearch        = flag.Int("search.maxTagValues", 100e3, "The maximum number of tag values returned per search")
 	maxTagValueSuffixesPerSearch = flag.Int("search.maxTagValueSuffixesPerSearch", 100e3, "The maximum number of tag value suffixes returned from /metrics/find")
-	maxMetricsPerSearch          = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series each search can scan")
-	maxMetricsPointSearch        = flag.Int("search.maxMetricsPointSearch", 300e3, "control search metrics point number")
+	maxMetricsPerSearch          = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries")

 	precisionBits         = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss")
 	disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
@ -1071,20 +1070,13 @@ func (s *Server) processVMSelectSearch(ctx *vmselectRequestCtx) error {
 		return fmt.Errorf("cannot send empty error message: %w", err)
 	}

-	count := 0
 	// Send found blocks to vmselect.
 	for ctx.sr.NextMetricBlock() {
 		ctx.mb.MetricName = ctx.sr.MetricBlockRef.MetricName
 		ctx.sr.MetricBlockRef.BlockRef.MustReadBlock(&ctx.mb.Block, fetchData)

 		vmselectMetricBlocksRead.Inc()
-		rowsCount := ctx.mb.Block.RowsCount()
-		vmselectMetricRowsRead.Add(rowsCount)
-		count += rowsCount
-		if count > *maxMetricsPointSearch {
-			logger.Errorf("more than -search.maxMetricsPointSearch=%d point,discard more points", *maxMetricsPointSearch)
-			break
-		}
+		vmselectMetricRowsRead.Add(ctx.mb.Block.RowsCount())

 		ctx.dataBuf = ctx.mb.Marshal(ctx.dataBuf[:0])
 		if err := ctx.writeDataBufBytes(); err != nil {
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -6,7 +6,8 @@ sort: 15

 ## tip

-* FEATURE: add `-search.maxSamplesPerSeries` command-line flag for limiting the number of raw samples a single query could process per each time series. This option can prevent from out of memory errors when a query processes tens of millions of raw samples per series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1067).
+* FEATURE: add `-search.maxSamplesPerSeries` command-line flag for limiting the number of raw samples a single query can process per each time series. This option can protect from out of memory errors when a query processes tens of millions of raw samples per series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1067).
+* FEATURE: add `-search.maxSamplesPerQuery` command-line flag for limiting the number of raw samples a single query can process across all the time series. This option can protect from heavy queries, which select too big number of raw samples. Thanks to @jiangxinlingdu for [the initial pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1478).

 * BUGFIX: vmbackup: automatically set default `us-east1` S3 region if it is missing. This should simplify using S3-compatible services such as MinIO for backups. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1449).
 * BUGFIX: vmselect: prevent from possible deadlock when multiple `target` query args are passed to [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage).
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@ -654,8 +654,10 @@ Below is the output for `/path/to/vmselect -help`:
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
-    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
+    	The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery (default 30000000)
  -search.maxStalenessInterval duration
    	The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
  -search.maxStatusRequestDuration duration
@ -667,7 +669,7 @@ Below is the output for `/path/to/vmselect -help`:
  -search.queryStats.lastQueriesCount int
    	Query stats for /api/v1/status/top_queries is tracked on this number of last queries. Zero value disables query stats tracking (default 20000)
  -search.queryStats.minQueryDuration duration
-    	The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats
+    	The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats (default 1ms)
  -search.resetCacheAuthKey string
    	Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call
  -search.treatDotsAsIsInRegexps
@ -762,7 +764,7 @@ Below is the output for `/path/to/vmstorage -help`:
  -search.maxTagValues int
    	The maximum number of tag values returned per search (default 100000)
  -search.maxUniqueTimeseries int
-    	The maximum number of unique time series each search can scan (default 300000)
+    	The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries (default 300000)
  -smallMergeConcurrency int
    	The maximum number of CPU cores to use for small merges. Default value is used if set to 0
  -snapshotAuthKey string
--- a/docs/README.md
+++ b/docs/README.md
@ -1785,6 +1785,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
  -search.maxStalenessInterval duration
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@ -1789,6 +1789,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
  -search.maxStalenessInterval duration