diff --git a/README.md b/README.md
index 2139bb7b4d..5d5e6b902a 100644
--- a/README.md
+++ b/README.md
@@ -650,8 +650,10 @@ Below is the output for `/path/to/vmselect -help`:
      Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
   -search.maxQueueDuration duration
      The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+     The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
   -search.maxSamplesPerSeries int
-     The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
+     The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery (default 30000000)
   -search.maxStalenessInterval duration
      The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
   -search.maxStatusRequestDuration duration
@@ -758,7 +760,7 @@ Below is the output for `/path/to/vmstorage -help`:
   -search.maxTagValues int
      The maximum number of tag values returned per search (default 100000)
   -search.maxUniqueTimeseries int
-     The maximum number of unique time series each search can scan (default 300000)
+     The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries (default 300000)
   -smallMergeConcurrency int
      The maximum number of CPU cores to use for small merges. Default value is used if set to 0
   -snapshotAuthKey string
diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go
index 5e1cfa52fa..2d19def4e3 100644
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@@ -35,7 +35,8 @@ import (
 var (
 	replicationFactor = flag.Int("replicationFactor", 1, "How many copies of every time series is available on vmstorage nodes. "+
 		"See -replicationFactor command-line flag for vminsert nodes")
-	maxSamplesPerSeries = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage")
+	maxSamplesPerSeries = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery")
+	maxSamplesPerQuery  = flag.Int("search.maxSamplesPerQuery", 1e9, "The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries")
 )

 // Result is a single timeseries result.
@@ -433,7 +434,7 @@ func (pts *packedTimeseries) Unpack(tbf *tmpBlocksFile, dst *Result, tr storage.
 		for _, sb := range upw.sbs {
 			samples += len(sb.Timestamps)
 		}
-		if samples < *maxSamplesPerSeries {
+		if *maxSamplesPerSeries <= 0 || samples < *maxSamplesPerSeries {
 			sbs = append(sbs, upw.sbs...)
 		} else {
 			firstErr = fmt.Errorf("cannot process more than %d samples per series; either increase -search.maxSamplesPerSeries "+
@@ -1390,6 +1391,7 @@ func ProcessSearchQuery(at *auth.Token, denyPartialResponse bool, sq *storage.Se
 	}
 	var wg syncwg.WaitGroup
 	var stopped uint32
+	var samples uint64
 	processBlock := func(mb *storage.MetricBlock) error {
 		wg.Add(1)
 		defer wg.Done()
@@ -1400,6 +1402,10 @@ func ProcessSearchQuery(at *auth.Token, denyPartialResponse bool, sq *storage.Se
 			tbfw.RegisterEmptyBlock(mb)
 			return nil
 		}
+		n := atomic.AddUint64(&samples, uint64(mb.Block.RowsCount()))
+		if *maxSamplesPerQuery > 0 && n > uint64(*maxSamplesPerQuery) {
+			return fmt.Errorf("cannot select more than -search.maxSamplesPerQuery=%d samples; possible solutions: to increase the -search.maxSamplesPerQuery; to reduce time range for the query; to use more specific label filters in order to select lower number of series", *maxSamplesPerQuery)
+		}
 		if err := tbfw.RegisterAndWriteBlock(mb); err != nil {
 			return fmt.Errorf("cannot write MetricBlock to temporary blocks file: %w", err)
 		}
diff --git a/app/vmstorage/transport/server.go b/app/vmstorage/transport/server.go
index 3ea00f48ec..ace63200ec 100644
--- a/app/vmstorage/transport/server.go
+++ b/app/vmstorage/transport/server.go
@@ -28,8 +28,7 @@ var (
 	maxTagKeysPerSearch          = flag.Int("search.maxTagKeys", 100e3, "The maximum number of tag keys returned per search")
 	maxTagValuesPerSearch        = flag.Int("search.maxTagValues", 100e3, "The maximum number of tag values returned per search")
 	maxTagValueSuffixesPerSearch = flag.Int("search.maxTagValueSuffixesPerSearch", 100e3, "The maximum number of tag value suffixes returned from /metrics/find")
-	maxMetricsPerSearch          = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series each search can scan")
-	maxMetricsPointSearch        = flag.Int("search.maxMetricsPointSearch", 300e3, "control search metrics point number")
+	maxMetricsPerSearch          = flag.Int("search.maxUniqueTimeseries", 300e3, "The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries")

 	precisionBits         = flag.Int("precisionBits", 64, "The number of precision bits to store per each value. Lower precision bits improves data compression at the cost of precision loss")
 	disableRPCCompression = flag.Bool(`rpc.disableCompression`, false, "Disable compression of RPC traffic. This reduces CPU usage at the cost of higher network bandwidth usage")
@@ -1071,20 +1070,13 @@ func (s *Server) processVMSelectSearch(ctx *vmselectRequestCtx) error {
 		return fmt.Errorf("cannot send empty error message: %w", err)
 	}

-	count := 0
 	// Send found blocks to vmselect.
 	for ctx.sr.NextMetricBlock() {
 		ctx.mb.MetricName = ctx.sr.MetricBlockRef.MetricName
 		ctx.sr.MetricBlockRef.BlockRef.MustReadBlock(&ctx.mb.Block, fetchData)

 		vmselectMetricBlocksRead.Inc()
-		rowsCount := ctx.mb.Block.RowsCount()
-		vmselectMetricRowsRead.Add(rowsCount)
-		count += rowsCount
-		if count > *maxMetricsPointSearch {
-			logger.Errorf("more than -search.maxMetricsPointSearch=%d point,discard more points", *maxMetricsPointSearch)
-			break
-		}
+		vmselectMetricRowsRead.Add(ctx.mb.Block.RowsCount())

 		ctx.dataBuf = ctx.mb.Marshal(ctx.dataBuf[:0])
 		if err := ctx.writeDataBufBytes(); err != nil {
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 5c6c36296c..d909970f85 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -6,7 +6,8 @@ sort: 15

 ## tip

-* FEATURE: add `-search.maxSamplesPerSeries` command-line flag for limiting the number of raw samples a single query could process per each time series. This option can prevent from out of memory errors when a query processes tens of millions of raw samples per series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1067).
+* FEATURE: add `-search.maxSamplesPerSeries` command-line flag for limiting the number of raw samples a single query can process per each time series. This option can protect from out of memory errors when a query processes tens of millions of raw samples per series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1067).
+* FEATURE: add `-search.maxSamplesPerQuery` command-line flag for limiting the number of raw samples a single query can process across all the time series. This option can protect from heavy queries, which select too big number of raw samples. Thanks to @jiangxinlingdu for [the initial pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/1478).
 * BUGFIX: vmbackup: automatically set default `us-east1` S3 region if it is missing. This should simplify using S3-compatible services such as MinIO for backups. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1449).
 * BUGFIX: vmselect: prevent from possible deadlock when multiple `target` query args are passed to [Graphite Render API](https://docs.victoriametrics.com/#graphite-render-api-usage).
diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md
index c6521b343e..6afd7a89fc 100644
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@@ -654,8 +654,10 @@ Below is the output for `/path/to/vmselect -help`:
      Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
   -search.maxQueueDuration duration
      The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+     The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
   -search.maxSamplesPerSeries int
-     The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
+     The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery (default 30000000)
   -search.maxStalenessInterval duration
      The maximum interval for staleness calculations. By default it is automatically calculated from the median interval between samples. This flag could be useful for tuning Prometheus data model closer to Influx-style data model. See https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness for details. See also '-search.maxLookback' flag, which has the same meaning due to historical reasons
   -search.maxStatusRequestDuration duration
@@ -667,7 +669,7 @@ Below is the output for `/path/to/vmselect -help`:
   -search.queryStats.lastQueriesCount int
      Query stats for /api/v1/status/top_queries is tracked on this number of last queries. Zero value disables query stats tracking (default 20000)
   -search.queryStats.minQueryDuration duration
-     The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats
+     The minimum duration for queries to track in query stats at /api/v1/status/top_queries. Queries with lower duration are ignored in query stats (default 1ms)
   -search.resetCacheAuthKey string
      Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call
   -search.treatDotsAsIsInRegexps
@@ -762,7 +764,7 @@ Below is the output for `/path/to/vmstorage -help`:
   -search.maxTagValues int
      The maximum number of tag values returned per search (default 100000)
   -search.maxUniqueTimeseries int
-     The maximum number of unique time series each search can scan (default 300000)
+     The maximum number of unique time series a single query can process. This allows protecting against heavy queries, which select unexpectedly high number of series. See also -search.maxSamplesPerQuery and -search.maxSamplesPerSeries (default 300000)
   -smallMergeConcurrency int
      The maximum number of CPU cores to use for small merges. Default value is used if set to 0
   -snapshotAuthKey string
diff --git a/docs/README.md b/docs/README.md
index f6b77f1618..e88d4a9f70 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1785,6 +1785,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
      Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
   -search.maxQueueDuration duration
      The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+     The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
   -search.maxSamplesPerSeries int
      The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
   -search.maxStalenessInterval duration
diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md
index 31a1c39b29..9444a63f11 100644
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -1789,6 +1789,8 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li
      Supports the following optional suffixes for size values: KB, MB, GB, KiB, MiB, GiB (default 16384)
   -search.maxQueueDuration duration
      The maximum time the request waits for execution when -search.maxConcurrentRequests limit is reached; see also -search.maxQueryDuration (default 10s)
+  -search.maxSamplesPerQuery int
+     The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries (default 1000000000)
   -search.maxSamplesPerSeries int
      The maximum number of raw samples a single query can scan per each time series. This option allows limiting memory usage (default 30000000)
   -search.maxStalenessInterval duration
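Usage sketch (the flag values below are illustrative, not the defaults): with this patch, the per-query and per-series sample limits can be tuned together on vmselect or on single-node VictoriaMetrics, and either limit can be disabled by setting the corresponding flag to zero or a negative value, since the checks added above are enforced only for positive flag values:

    /path/to/vmselect -search.maxSamplesPerQuery=2000000000 -search.maxSamplesPerSeries=50000000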