app/vmselect: follow-up for 626073bca8

* Rename -search.maxMetricsPointSearch to -search.maxSamplesPerQuery, so it is more consistent with the existing -search.maxSamplesPerSeries
* Move the -search.maxSamplesPerQuery from vmstorage to vmselect, so it could effectively limit the number of raw samples obtained from all the vmstorage nodes
* Document the -search.maxSamplesPerQuery in docs/CHANGELOG.md
2024-11-23 12:31:07 +01:00 · 2021-07-28 17:40:09 +03:00 · 2021-07-28 17:40:09 +03:00 · 8ee8660ac4
commit 8ee8660ac4
parent 9ffd70a921
7 changed files with 31 additions and 6 deletions
--- a/README.md
+++ b/README.md
@ -1785,6 +1785,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
  -search.maxStalenessInterval duration
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@ -28,6 +28,7 @@ var (
 	maxTagValueSuffixesPerSearch = flag.Int("search.maxTagValueSuffixesPerSearch", 100e3, "The maximum number of tag value suffixes returned from /metrics/find")
 	maxMetricsPerSearch          = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series each search can scan. This option allows limiting memory usage")
 	maxSamplesPerSeries          = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage")
+	maxSamplesPerQuery           = flag.Int("search.maxSamplesPerQuery", 1e9, "The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries")
 )

 // Result is a single timeseries result.
@ -423,7 +424,7 @@ func (pts *packedTimeseries) Unpack(dst *Result, tbf *tmpBlocksFile, tr storage.
 			for _, sb := range upw.sbs {
 				samples += len(sb.Timestamps)
 			}
-			if samples < *maxSamplesPerSeries {
+			if *maxSamplesPerSeries <= 0 || samples < *maxSamplesPerSeries {
 				sbs = append(sbs, upw.sbs...)
 			} else {
 				firstErr = fmt.Errorf("cannot process more than %d samples per series; either increase -search.maxSamplesPerSeries "+
@ -1006,6 +1007,7 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline search
 	m := make(map[string][]blockRef, maxSeriesCount)
 	orderedMetricNames := make([]string, 0, maxSeriesCount)
 	blocksRead := 0
+	samples := 0
 	tbf := getTmpBlocksFile()
 	var buf []byte
 	for sr.NextMetricBlock() {
@ -1015,7 +1017,14 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline search
 			putStorageSearch(sr)
 			return nil, fmt.Errorf("timeout exceeded while fetching data block #%d from storage: %s", blocksRead, deadline.String())
 		}
-		buf = sr.MetricBlockRef.BlockRef.Marshal(buf[:0])
+		br := sr.MetricBlockRef.BlockRef
+		samples += br.RowsCount()
+		if *maxSamplesPerQuery > 0 && samples > *maxSamplesPerQuery {
+			putTmpBlocksFile(tbf)
+			putStorageSearch(sr)
+			return nil, fmt.Errorf("cannot select more than -search.maxSamplesPerQuery=%d samples; possible solutions: to increase the -search.maxSamplesPerQuery; to reduce time range for the query; to use more specific label filters in order to select lower number of series", *maxSamplesPerQuery)
+		}
+		buf = br.Marshal(buf[:0])
 		addr, err := tbf.WriteBlockRefData(buf)
 		if err != nil {
 			putTmpBlocksFile(tbf)
@ -1026,7 +1035,7 @@ func ProcessSearchQuery(sq *storage.SearchQuery, fetchData bool, deadline search
 		metricNameStrUnsafe := bytesutil.ToUnsafeString(metricName)
 		brs := m[metricNameStrUnsafe]
 		brs = append(brs, blockRef{
-			partRef: sr.MetricBlockRef.BlockRef.PartRef(),
+			partRef: br.PartRef(),
 			addr:    addr,
 		})
 		if len(brs) > 1 {
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -6,7 +6,8 @@ sort: 15

 ## tip

-* FEATURE: add `-search.maxSamplesPerSeries` command-line flag for limiting the number of raw samples a single query could process per each time series. This option can prevent from out of memory errors when a query processes tens of millions of raw samples per series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1067).
+* FEATURE: add `-search.maxSamplesPerSeries` command-line flag for limiting the number of raw samples a single query can process per each time series. This option can protect from out of memory errors when a query processes tens of millions of raw samples per series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1067).
+* FEATURE: add `-search.maxSamplesPerQuery` command-line flag for limiting the number of raw samples a single query can process across all the time series. This option can protect against heavy queries that select too many raw samples. Thanks to @jiangxinlingdu for [the initial pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1478).

 * BUGFIX: vmbackup: automatically set default `us-east1` S3 region if it is missing. This should simplify using S3-compatible services such as MinIO for backups. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1449).
 * BUGFIX: vmselect: prevent from possible deadlock when multiple `target` query args are passed to [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage).
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@ -654,6 +654,10 @@ Below is the output for `/path/to/vmselect -help`:
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
+  -search.maxSamplesPerSeries int
+    	The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery (default 30000000)
  -search.maxStalenessInterval duration
    	The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
  -search.maxStatusRequestDuration duration
@ -665,7 +669,7 @@ Below is the output for `/path/to/vmselect -help`:
  -search.queryStats.lastQueriesCount int
    	Query stats for /api/v1/status/top_queries is tracked on this number of last queries. Zero value disables query stats tracking (default 20000)
  -search.queryStats.minQueryDuration duration
-    	The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats
+    	The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats (default 1ms)
  -search.resetCacheAuthKey string
    	Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call
  -search.treatDotsAsIsInRegexps
@ -760,7 +764,7 @@ Below is the output for `/path/to/vmstorage -help`:
  -search.maxTagValues int
    	The maximum number of tag values returned per search (default 100000)
  -search.maxUniqueTimeseries int
-    	The maximum number of unique time series each search can scan (default 300000)
+    	The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries (default 300000)
  -smallMergeConcurrency int
    	The maximum number of CPU cores to use for small merges. Default value is used if set to 0
  -snapshotAuthKey string
--- a/docs/README.md
+++ b/docs/README.md
@ -1785,6 +1785,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
  -search.maxStalenessInterval duration
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@ -1789,6 +1789,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
    	Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
  -search.maxQueueDuration duration
    	The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+    	The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
  -search.maxSamplesPerSeries int
    	The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
  -search.maxStalenessInterval duration
--- a/lib/storage/search.go
+++ b/lib/storage/search.go
@ -48,6 +48,11 @@ func (br *BlockRef) Marshal(dst []byte) []byte {
 	return br.bh.Marshal(dst)
 }

+// RowsCount returns the number of rows in br, i.e. the br.bh.RowsCount value converted to int.
+func (br *BlockRef) RowsCount() int {
+	return int(br.bh.RowsCount)
+}
+
 // PartRef returns PartRef from br.
 func (br *BlockRef) PartRef() PartRef {
 	return PartRef{