diff --git a/README.md b/README.md index 31498b12e..c1d0f47e7 100644 --- a/README.md +++ b/README.md @@ -1357,6 +1357,7 @@ By default VictoriaMetrics is tuned for an optimal resource usage under typical - `-search.maxSamplesPerQuery` limits the number of raw samples a single query can process. This allows limiting CPU usage for heavy queries. - `-search.maxPointsPerTimeseries` limits the number of calculated points, which can be returned per each matching time series from [range query](https://docs.victoriametrics.com/keyConcepts.html#range-query). - `-search.maxPointsSubqueryPerTimeseries` limits the number of calculated points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. +- `-search.maxSeriesPerAggrFunc` limits the number of time series, which can be generated by [MetricsQL aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions) in a single query. - `-search.maxSeries` limits the number of time series, which may be returned from [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). This endpoint is used mostly by Grafana for auto-completion of metric names, label names and label values. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxSeries` to quite low value in order limit CPU and memory usage. - `-search.maxTagKeys` limits the number of items, which may be returned from [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names). This endpoint is used mostly by Grafana for auto-completion of label names. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagKeys` to quite low value in order to limit CPU and memory usage. - `-search.maxTagValues` limits the number of items, which may be returned from [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values). This endpoint is used mostly by Grafana for auto-completion of label values. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagValues` to quite low value in order to limit CPU and memory usage. diff --git a/app/vmselect/promql/aggr.go b/app/vmselect/promql/aggr.go index 802411ef0..98fb1a6a9 100644 --- a/app/vmselect/promql/aggr.go +++ b/app/vmselect/promql/aggr.go @@ -1,6 +1,7 @@ package promql import ( + "flag" "fmt" "math" "sort" @@ -15,6 +16,8 @@ import ( "github.com/cespare/xxhash/v2" ) +var maxSeriesPerAggrFunc = flag.Int("search.maxSeriesPerAggrFunc", 1e6, "The maximum number of time series an aggregate MetricsQL function can generate") + var aggrFuncs = map[string]aggrFunc{ "any": aggrFuncAny, "avg": newAggrFunc(aggrFuncAvg), @@ -106,6 +109,16 @@ func removeGroupTags(metricName *storage.MetricName, modifier *metricsql.Modifie func aggrFuncExt(afe func(tss []*timeseries, modifier *metricsql.ModifierExpr) []*timeseries, argOrig []*timeseries, modifier *metricsql.ModifierExpr, maxSeries int, keepOriginal bool) ([]*timeseries, error) { + m := aggrPrepareSeries(argOrig, modifier, maxSeries, keepOriginal) + rvs := make([]*timeseries, 0, len(m)) + for _, tss := range m { + rv := afe(tss, modifier) + rvs = append(rvs, rv...) + } + return rvs, nil +} + +func aggrPrepareSeries(argOrig []*timeseries, modifier *metricsql.ModifierExpr, maxSeries int, keepOriginal bool) map[string][]*timeseries { // Remove empty time series, e.g. series with all NaN samples, // since such series are ignored by aggregate functions. argOrig = removeEmptySeries(argOrig) @@ -130,21 +143,7 @@ func aggrFuncExt(afe func(tss []*timeseries, modifier *metricsql.ModifierExpr) [ m[k] = tss } bbPool.Put(bb) - - srcTssCount := 0 - dstTssCount := 0 - rvs := make([]*timeseries, 0, len(m)) - for _, tss := range m { - rv := afe(tss, modifier) - rvs = append(rvs, rv...) - srcTssCount += len(tss) - dstTssCount += len(rv) - if dstTssCount > 2000 && dstTssCount > 16*srcTssCount { - // This looks like count_values explosion. - return nil, fmt.Errorf(`too many timeseries after aggragation; got %d; want less than %d`, dstTssCount, 16*srcTssCount) - } - } - return rvs, nil + return m } func aggrFuncAny(afa *aggrFuncArg) ([]*timeseries, error) { @@ -588,46 +587,56 @@ func aggrFuncCountValues(afa *aggrFuncArg) ([]*timeseries, error) { // Do nothing } - afe := func(tss []*timeseries, modififer *metricsql.ModifierExpr) []*timeseries { - m := make(map[float64]bool) + afe := func(tss []*timeseries, modififer *metricsql.ModifierExpr) ([]*timeseries, error) { + m := make(map[float64]*timeseries) for _, ts := range tss { - for _, v := range ts.Values { + for i, v := range ts.Values { if math.IsNaN(v) { continue } - m[v] = true - } - } - values := make([]float64, 0, len(m)) - for v := range m { - values = append(values, v) - } - sort.Float64s(values) - - var rvs []*timeseries - for _, v := range values { - var dst timeseries - dst.CopyFromShallowTimestamps(tss[0]) - dst.MetricName.RemoveTag(dstLabel) - dst.MetricName.AddTag(dstLabel, strconv.FormatFloat(v, 'f', -1, 64)) - for i := range dst.Values { - count := 0 - for _, ts := range tss { - if ts.Values[i] == v { - count++ + dst := m[v] + if dst == nil { + if len(m) >= *maxSeriesPerAggrFunc { + return nil, fmt.Errorf("more than -search.maxSeriesPerAggrFunc=%d are generated by count_values()", *maxSeriesPerAggrFunc) } + dst = ×eries{} + dst.CopyFromShallowTimestamps(tss[0]) + dst.MetricName.RemoveTag(dstLabel) + dst.MetricName.AddTag(dstLabel, strconv.FormatFloat(v, 'f', -1, 64)) + values := dst.Values + for j := range values { + values[j] = nan + } + m[v] = dst } - n := float64(count) - if n == 0 { - n = nan + values := dst.Values + if math.IsNaN(values[i]) { + values[i] = 1 + } else { + values[i]++ } - dst.Values[i] = n } - rvs = append(rvs, &dst) } - return rvs + rvs := make([]*timeseries, 0, len(m)) + for _, ts := range m { + rvs = append(rvs, ts) + } + return rvs, nil } - return aggrFuncExt(afe, args[1], &afa.ae.Modifier, afa.ae.Limit, false) + + m := aggrPrepareSeries(args[1], &afa.ae.Modifier, afa.ae.Limit, false) + rvs := make([]*timeseries, 0, len(m)) + for _, tss := range m { + rv, err := afe(tss, modifier) + if err != nil { + return nil, err + } + rvs = append(rvs, rv...) + if len(rvs) > *maxSeriesPerAggrFunc { + return nil, fmt.Errorf("more than -search.maxSeriesPerAggrFunc=%d are generated by count_values()", *maxSeriesPerAggrFunc) + } + } + return rvs, nil } func newAggrFuncTopK(isReverse bool) aggrFunc { diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 3dc75047a..1ef1a7c04 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -35,6 +35,7 @@ The following tip changes can be tested by building VictoriaMetrics components f * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_outliers(k, q)` function for dropping outliers located farther than `k*range_mad(q)` from the `range_median(q)`. This should help removing outliers during query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759). * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_zscore(z, q)` function for dropping outliers located farther than `z*range_stddev(q)` from `range_avg(q)`. This should help removing outliers during query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759). * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show `median` instead of `avg` in graph tooltip and line legend, since `median` is more tolerant against spikes. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3706). +* FEATURE: add `-search.maxSeriesPerAggrFunc` command-line flag, which can be used for limiting the number of time series [MetricsQL aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions) can return in a single query. This flag can be useful for preventing OOMs when [count_values](https://docs.victoriametrics.com/MetricsQL.html#count_values) function is improperly used. * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): small UX improvements for mobile view. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3707) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3848). * FEATURE: add `-search.logQueryMemoryUsage` command-line flag for logging queries, which need more memory than specified by this command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3553). Thanks to @michal-kralik for the idea and the intial implementation. diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index 435fba771..4fcbac798 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -584,6 +584,7 @@ Some workloads may need fine-grained resource usage limits. In these cases the f matching time series from [range query](https://docs.victoriametrics.com/keyConcepts.html#range-query). - `-search.maxPointsSubqueryPerTimeseries` limits the number of calculated points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. +- `-search.maxSeriesPerAggrFunc` limits the number of time series, which can be generated by [MetricsQL aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions) in a single query. - `-search.maxSeries` at `vmselect` limits the number of time series, which may be returned from [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). This endpoint is used mostly by Grafana for auto-completion of metric names, label names and label values. diff --git a/docs/README.md b/docs/README.md index bbf5c019a..632875f8a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1358,6 +1358,7 @@ By default VictoriaMetrics is tuned for an optimal resource usage under typical - `-search.maxSamplesPerQuery` limits the number of raw samples a single query can process. This allows limiting CPU usage for heavy queries. - `-search.maxPointsPerTimeseries` limits the number of calculated points, which can be returned per each matching time series from [range query](https://docs.victoriametrics.com/keyConcepts.html#range-query). - `-search.maxPointsSubqueryPerTimeseries` limits the number of calculated points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. +- `-search.maxSeriesPerAggrFunc` limits the number of time series, which can be generated by [MetricsQL aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions) in a single query. - `-search.maxSeries` limits the number of time series, which may be returned from [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). This endpoint is used mostly by Grafana for auto-completion of metric names, label names and label values. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxSeries` to quite low value in order limit CPU and memory usage. - `-search.maxTagKeys` limits the number of items, which may be returned from [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names). This endpoint is used mostly by Grafana for auto-completion of label names. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagKeys` to quite low value in order to limit CPU and memory usage. - `-search.maxTagValues` limits the number of items, which may be returned from [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values). This endpoint is used mostly by Grafana for auto-completion of label values. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagValues` to quite low value in order to limit CPU and memory usage. diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index 336048257..28fbf2f3c 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -1361,6 +1361,7 @@ By default VictoriaMetrics is tuned for an optimal resource usage under typical - `-search.maxSamplesPerQuery` limits the number of raw samples a single query can process. This allows limiting CPU usage for heavy queries. - `-search.maxPointsPerTimeseries` limits the number of calculated points, which can be returned per each matching time series from [range query](https://docs.victoriametrics.com/keyConcepts.html#range-query). - `-search.maxPointsSubqueryPerTimeseries` limits the number of calculated points, which can be generated per each matching time series during [subquery](https://docs.victoriametrics.com/MetricsQL.html#subqueries) evaluation. +- `-search.maxSeriesPerAggrFunc` limits the number of time series, which can be generated by [MetricsQL aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions) in a single query. - `-search.maxSeries` limits the number of time series, which may be returned from [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers). This endpoint is used mostly by Grafana for auto-completion of metric names, label names and label values. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxSeries` to quite low value in order limit CPU and memory usage. - `-search.maxTagKeys` limits the number of items, which may be returned from [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names). This endpoint is used mostly by Grafana for auto-completion of label names. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagKeys` to quite low value in order to limit CPU and memory usage. - `-search.maxTagValues` limits the number of items, which may be returned from [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values). This endpoint is used mostly by Grafana for auto-completion of label values. Queries to this endpoint may take big amounts of CPU time and memory when the database contains big number of unique time series because of [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). In this case it might be useful to set the `-search.maxTagValues` to quite low value in order to limit CPU and memory usage.