Revert "app/vmselect: fix the way of counting raw samples in single query (#6464)"

This reverts commit 6e395048d3. Reason for revert: the previous logic was correct. The purpose of `-search.maxSamplesPerQuery` command-line flag is to limit the amounts of CPU resources, which could be taken by a single query - see https://docs.victoriametrics.com/#resource-usage-limits . VictoriaMetrics processes samples in blocks during querying - it reads the block, then unpacks it, then filters out samples outside the selected time range. This means that it _spends CPU time_ on reading and unpacking of _all the samples_ in every block on the requested time range, even if only a single sample per each block matches the given time range. The previous logic was effectively limiting CPU time a single query could take. The new logic fails limiting CPU time a single query could take in some pathological cases when only a small fraction of samples per each requested block fit the requested time range. This allows performing multiplication DoS-attacks by querying very narrow time ranges over historical blocks, which tend to be full. For example, if the `-search.maxSamplesPerQuery` equals to a billion, and the query requests a single sample out of 8K samples per each block, this means that the query may unpack a billion of such blocks without exceeding the limit, e.g. it may unpack and process 8K*1e9=8e12 samples. This is not what the resource usage limits were created for originally - see https://docs.victoriametrics.com/#resource-usage-limits Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5851 Updates https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6464
2025-01-20 07:19:17 +01:00 · 2024-06-25 02:43:57 +02:00 · 2024-06-25 02:43:57 +02:00 · b07a02c516
commit b07a02c516
parent 10390a7dfc
2 changed files with 10 additions and 27 deletions
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@ -91,13 +91,6 @@ type timeseriesWork struct {
 	err      error

 	rowsProcessed int
-
-	querySamplesQuota *querySamplesQuota
-}
-
-type querySamplesQuota struct {
-	mu           sync.Mutex
-	samplesQuota int
 }

 func (tsw *timeseriesWork) do(r *Result, workerID uint) error {
@ -114,17 +107,6 @@ func (tsw *timeseriesWork) do(r *Result, workerID uint) error {
 		return fmt.Errorf("error during time series unpacking: %w", err)
 	}
 	tsw.rowsProcessed = len(r.Timestamps)
-
-	tsw.querySamplesQuota.mu.Lock()
-	tsw.querySamplesQuota.samplesQuota -= tsw.rowsProcessed
-	if tsw.querySamplesQuota.samplesQuota < 0 {
-		tsw.mustStop.Store(true)
-		tsw.querySamplesQuota.mu.Unlock()
-		return fmt.Errorf("cannot select more than -search.maxSamplesPerQuery=%d samples; possible solutions: increase the -search.maxSamplesPerQuery; "+
-			"reduce time range for the query; use more specific label filters in order to select fewer series", *maxSamplesPerQuery)
-	}
-	tsw.querySamplesQuota.mu.Unlock()
-
 	if len(r.Timestamps) > 0 {
 		if err := tsw.f(r, workerID); err != nil {
 			tsw.mustStop.Store(true)
@ -260,16 +242,11 @@ func (rss *Results) runParallel(qt *querytracer.Tracer, f func(rs *Result, worke
 	}

 	var mustStop atomic.Bool
-	limit := *maxSamplesPerQuery
-	sampleQuota := &querySamplesQuota{
-		samplesQuota: limit,
-	}
 	initTimeseriesWork := func(tsw *timeseriesWork, pts *packedTimeseries) {
 		tsw.rss = rss
 		tsw.pts = pts
 		tsw.f = f
 		tsw.mustStop = &mustStop
-		tsw.querySamplesQuota = sampleQuota
 	}
 	maxWorkers := MaxWorkers()
 	if maxWorkers == 1 || tswsLen == 1 {
@ -1173,7 +1150,7 @@ func ProcessSearchQuery(qt *querytracer.Tracer, sq *storage.SearchQuery, deadlin
 	}

 	blocksRead := 0
-	blockSamples := 0
+	samples := 0
 	tbf := getTmpBlocksFile()
 	var buf []byte
 	var metricNamePrev []byte
@ -1215,7 +1192,14 @@ func ProcessSearchQuery(qt *querytracer.Tracer, sq *storage.SearchQuery, deadlin
 			return nil, fmt.Errorf("timeout exceeded while fetching data block #%d from storage: %s", blocksRead, deadline.String())
 		}
 		br := sr.MetricBlockRef.BlockRef
-		blockSamples += br.RowsCount()
+		samples += br.RowsCount()
+		if *maxSamplesPerQuery > 0 && samples > *maxSamplesPerQuery {
+			putTmpBlocksFile(tbf)
+			putStorageSearch(sr)
+			return nil, fmt.Errorf("cannot select more than -search.maxSamplesPerQuery=%d samples; possible solutions: increase the -search.maxSamplesPerQuery; "+
+				"reduce time range for the query; use more specific label filters in order to select fewer series", *maxSamplesPerQuery)
+		}
+
 		buf = br.Marshal(buf[:0])
 		addr, err := tbf.WriteBlockRefData(buf)
 		if err != nil {
@ -1289,7 +1273,7 @@ func ProcessSearchQuery(qt *querytracer.Tracer, sq *storage.SearchQuery, deadlin
 		putStorageSearch(sr)
 		return nil, fmt.Errorf("cannot finalize temporary file: %w", err)
 	}
-	qt.Printf("fetch unique series=%d, blocks=%d, samples=%d, bytes=%d", len(m), blocksRead, blockSamples, tbf.Len())
+	qt.Printf("fetch unique series=%d, blocks=%d, samples=%d, bytes=%d", len(m), blocksRead, samples, tbf.Len())

 	var rss Results
 	rss.tr = tr
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -58,7 +58,6 @@ Released at 2024-06-24
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/) enterprise: properly configure authentication with S3 when `-s3.configFilePath` cmd-line flag is specified for reading rule configs.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert/): properly specify oauth2 `ClientSecret` when configuring authentication for `notifier.url`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6471) for details. Thanks to @yincongcyincong for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6478).
 * BUGFIX: [Single-node VictoriaMetrics](https://docs.victoriametrics.com/) and `vmstorage` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): add validation for the max value specified for `-retentionPeriod`. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6330) for details.
-* BUGFIX: [vmselect](https://docs.victoriametrics.com/cluster-victoriametrics/): calculate the exact number of [raw samples](https://docs.victoriametrics.com/keyconcepts/#raw-samples) during query processing, the limit is specified via command-line flag `-search.maxSamplesPerQuery`. Previously, samples could have been over-counted and lead to false-positive errors of `maxSamplesPerQuery`limit exceeded. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5851).
 * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): **copy row** button in Table view produces unexpected result. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6421) and [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/6495).
 * BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): prevent hanging when processing groups without rules. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6500).