From bf01a97f17ec87cb87dc003b48c541e5e4772d53 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 2 Nov 2023 19:47:36 +0100 Subject: [PATCH] docs/CHANGELOG.md: update the description of the optimization for SLO/SLI-like queries according to latest changes See commits 4497a08e3d5b4bc094c1d96ae93c99a1593885b3 and 92826b0b4a414e84410068d47639362f54f4fe87 --- README.md | 3 +++ app/vmselect/promql/eval.go | 4 ++-- docs/CHANGELOG.md | 12 ++++++++---- docs/Cluster-VictoriaMetrics.md | 3 +++ docs/README.md | 3 +++ docs/Single-server-VictoriaMetrics.md | 3 +++ lib/flagutil/duration.go | 3 ++- 7 files changed, 24 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index a6d2fc5a94..79da11dccd 100644 --- a/README.md +++ b/README.md @@ -1264,6 +1264,9 @@ Below is the output for `/path/to/vmselect -help`: The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4) -search.minStalenessInterval duration The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval' + -search.minWindowForInstantRollupOptimization value + Enable cache-based optimization for repeated queries to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value + The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). 
If suffix isn't set, then the duration is counted in months (default 6h) -search.noStaleMarkers Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets -search.queryStats.lastQueriesCount int diff --git a/app/vmselect/promql/eval.go b/app/vmselect/promql/eval.go index ee9cb8dd62..669e9fb726 100644 --- a/app/vmselect/promql/eval.go +++ b/app/vmselect/promql/eval.go @@ -42,8 +42,8 @@ var ( "See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery") noStaleMarkers = flag.Bool("search.noStaleMarkers", false, "Set this flag to true if the database doesn't contain Prometheus stale markers, "+ "so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets") - minWindowForInstantRollupOptimization = flag.Duration("search.minWindowForInstantRollupOptimization", 6*time.Hour, "Enable optimization for queries to /api/v1/query "+ - "(aka instant queries), which contain rollup functions with lookbehind window exceeding the given value") + minWindowForInstantRollupOptimization = flagutil.NewDuration("search.minWindowForInstantRollupOptimization", "6h", "Enable cache-based optimization for repeated queries "+ + "to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value") ) // The minimum number of points per timeseries for enabling time rounding. diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index e705446110..8f7c149c5d 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -33,13 +33,17 @@ The sandbox cluster installation is running under the constant load generated by * SECURITY: upgrade Go builder from Go1.21.1 to Go1.21.3. 
See [the list of issues addressed in Go1.21.2](https://github.com/golang/go/issues?q=milestone%3AGo1.21.2+label%3ACherryPickApproved) and [the list of issues addressed in Go1.21.3](https://github.com/golang/go/issues?q=milestone%3AGo1.21.3+label%3ACherryPickApproved). -* FEATURE: `vmselect`: improve performance for repeated [instant queries](https://docs.victoriametrics.com/keyConcepts.html#instant-query) if they contain one of the following [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with lookbehind window in square brackets bigger or equal to 1 day: - - [sum_over_time](https://docs.victoriametrics.com/MetricsQL.html#sum_over_time) - - [count_over_time](https://docs.victoriametrics.com/MetricsQL.html#count_over_time) +* FEATURE: `vmselect`: improve performance for repeated [instant queries](https://docs.victoriametrics.com/keyConcepts.html#instant-query) if they contain one of the following [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions): - [avg_over_time](https://docs.victoriametrics.com/MetricsQL.html#avg_over_time) + - [sum_over_time](https://docs.victoriametrics.com/MetricsQL.html#sum_over_time) + - [count_eq_over_time](https://docs.victoriametrics.com/MetricsQL.html#count_eq_over_time) + - [count_gt_over_time](https://docs.victoriametrics.com/MetricsQL.html#count_gt_over_time) + - [count_le_over_time](https://docs.victoriametrics.com/MetricsQL.html#count_le_over_time) + - [count_ne_over_time](https://docs.victoriametrics.com/MetricsQL.html#count_ne_over_time) + - [count_over_time](https://docs.victoriametrics.com/MetricsQL.html#count_over_time) - [increase](https://docs.victoriametrics.com/MetricsQL.html#increase) - [rate](https://docs.victoriametrics.com/MetricsQL.html#rate) - These functions are usually used in SLO/SLI queries such as `avg_over_time(up[30d])` or `sum(rate(http_request_errors_total[3d])) / sum(rate(http_requests_total[3d]))`. 
+ The optimization is enabled when these functions contain a lookbehind window in square brackets that is greater than or equal to `6h` (the threshold can be changed via the `-search.minWindowForInstantRollupOptimization` command-line flag). The optimization improves performance for SLO/SLI-like queries such as `avg_over_time(up[30d])` or `sum(rate(http_request_errors_total[3d])) / sum(rate(http_requests_total[3d]))`, which can be generated by [sloth](https://github.com/slok/sloth) or similar projects. * FEATURE: `vmselect`: improve query performance on systems with big number of CPU cores (`>=32`). Add `-search.maxWorkersPerQuery` command-line flag, which can be used for fine-tuning query performance on systems with big number of CPU cores. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5195). * FEATURE: `vmselect`: expose `vm_memory_intensive_queries_total` counter metric which gets increased each time `-search.logQueryMemoryUsage` memory limit is exceeded by a query. This metric should help to identify expensive and heavy queries without inspecting the logs. * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add [drop_empty_series()](https://docs.victoriametrics.com/MetricsQL.html#drop_empty_series) function, which can be used for filtering out empty series before performing additional calculations as shown in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5071). diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index 61a5487dbb..df7efe548b 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -1275,6 +1275,9 @@ Below is the output for `/path/to/vmselect -help`: The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. 
The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4) -search.minStalenessInterval duration The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval' + -search.minWindowForInstantRollupOptimization value + Enable cache-based optimization for repeated queries to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value + The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 6h) -search.noStaleMarkers Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets -search.queryStats.lastQueriesCount int diff --git a/docs/README.md b/docs/README.md index beaa1d6875..f0d38a5f89 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2870,6 +2870,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). 
There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4) -search.minStalenessInterval duration The minimum interval for staleness calculations. This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval' + -search.minWindowForInstantRollupOptimization value + Enable cache-based optimization for repeated queries to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value + The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 6h) -search.noStaleMarkers Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets -search.queryStats.lastQueriesCount int diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md index 7504920cf3..b69dbd4cd0 100644 --- a/docs/Single-server-VictoriaMetrics.md +++ b/docs/Single-server-VictoriaMetrics.md @@ -2878,6 +2878,9 @@ Pass `-help` to VictoriaMetrics in order to see the list of supported command-li The maximum number of CPU cores a single query can use. The default value should work good for most cases. The flag can be set to lower values for improving performance of big number of concurrently executed queries. The flag can be set to bigger values for improving performance of heavy queries, which scan big number of time series (>10K) and/or big number of samples (>100M). There is no sense in setting this flag to values bigger than the number of CPU cores available on the system (default 4) -search.minStalenessInterval duration The minimum interval for staleness calculations. 
This flag could be useful for removing gaps on graphs generated from time series with irregular intervals between samples. See also '-search.maxStalenessInterval' + -search.minWindowForInstantRollupOptimization value + Enable cache-based optimization for repeated queries to /api/v1/query (aka instant queries), which contain rollup functions with lookbehind window exceeding the given value + The following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months (default 6h) -search.noStaleMarkers Set this flag to true if the database doesn't contain Prometheus stale markers, so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets -search.queryStats.lastQueriesCount int diff --git a/lib/flagutil/duration.go b/lib/flagutil/duration.go index e33d1fc132..338d1432cf 100644 --- a/lib/flagutil/duration.go +++ b/lib/flagutil/duration.go @@ -14,7 +14,8 @@ import ( // // DefaultValue is in months. func NewDuration(name string, defaultValue string, description string) *Duration { - description += "\nThe following optional suffixes are supported: h (hour), d (day), w (week), y (year). If suffix isn't set, then the duration is counted in months" + description += "\nThe following optional suffixes are supported: s (second), m (minute), h (hour), d (day), w (week), y (year). "+ + "If suffix isn't set, then the duration is counted in months" d := &Duration{} if err := d.Set(defaultValue); err != nil { panic(fmt.Sprintf("BUG: can not parse default value %s for flag %s", defaultValue, name))