From a14188dd8ec2dd2527d838f266cb39ab8e2fadeb Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Tue, 28 Jun 2022 20:18:08 +0300 Subject: [PATCH] app/vmselect: expose additional histograms at `/metrics` page, which may help get more insights for the query workload This commit is based on https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2792 --- app/vmselect/netstorage/netstorage.go | 12 ++++++---- app/vmselect/promql/eval.go | 7 +++++- docs/CHANGELOG.md | 6 +++++ docs/keyConcepts.md | 32 +++++++++++++-------------- 4 files changed, 36 insertions(+), 21 deletions(-) diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index 5fe8ded37..09ea78807 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -243,12 +243,13 @@ func (rss *Results) RunParallel(qt *querytracer.Tracer, f func(rs *Result, worke // Return just the first error, since other errors are likely duplicate the first error. firstErr = err } + rowsReadPerSeries.Update(float64(tsw.rowsProcessed)) rowsProcessedTotal += tsw.rowsProcessed putTimeseriesWork(tsw) } - perQueryRowsProcessed.Update(float64(rowsProcessedTotal)) - perQuerySeriesProcessed.Update(float64(seriesProcessedTotal)) + rowsReadPerQuery.Update(float64(rowsProcessedTotal)) + seriesReadPerQuery.Update(float64(seriesProcessedTotal)) // Shut down local workers for _, workCh := range workChs { @@ -260,8 +261,11 @@ func (rss *Results) RunParallel(qt *querytracer.Tracer, f func(rs *Result, worke return firstErr } -var perQueryRowsProcessed = metrics.NewHistogram(`vm_per_query_rows_processed_count`) -var perQuerySeriesProcessed = metrics.NewHistogram(`vm_per_query_series_processed_count`) +var ( + rowsReadPerSeries = metrics.NewHistogram(`vm_rows_read_per_series`) + rowsReadPerQuery = metrics.NewHistogram(`vm_rows_read_per_query`) + seriesReadPerQuery = metrics.NewHistogram(`vm_series_read_per_query`) +) var gomaxprocs = cgroup.AvailableCPUs() diff --git 
a/app/vmselect/promql/eval.go b/app/vmselect/promql/eval.go index 684a83b60..23b003613 100644 --- a/app/vmselect/promql/eval.go +++ b/app/vmselect/promql/eval.go @@ -786,10 +786,13 @@ func evalRollupFuncWithSubquery(qt *querytracer.Tracer, ec *EvalConfig, funcName } return values, timestamps }) + rowsScannedPerQuery.Update(float64(samplesScannedTotal)) qt.Printf("rollup %s() over %d series returned by subquery: series=%d, samplesScanned=%d", funcName, len(tssSQ), len(tss), samplesScannedTotal) return tss, nil } +var rowsScannedPerQuery = metrics.NewHistogram(`vm_rows_scanned_per_query`) + func getKeepMetricNames(expr metricsql.Expr) bool { if ae, ok := expr.(*metricsql.AggrFuncExpr); ok { // Extract rollupFunc(...) from aggrFunc(rollupFunc(...)). @@ -1017,6 +1020,7 @@ func evalRollupWithIncrementalAggregate(qt *querytracer.Tracer, funcName string, return nil, err } tss := iafc.finalizeTimeseries() + rowsScannedPerQuery.Update(float64(samplesScannedTotal)) qt.Printf("series after aggregation with %s(): %d; samplesScanned=%d", iafc.ae.Name, len(tss), samplesScannedTotal) return tss, nil } @@ -1049,10 +1053,11 @@ func evalRollupNoIncrementalAggregate(qt *querytracer.Tracer, funcName string, k } return nil }) - qt.Printf("samplesScanned=%d", samplesScannedTotal) if err != nil { return nil, err } + rowsScannedPerQuery.Update(float64(samplesScannedTotal)) + qt.Printf("samplesScanned=%d", samplesScannedTotal) return tss, nil } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index e9e86fcaf..b42696deb 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -34,6 +34,12 @@ scrape_configs: * FEATURE: [query tracing](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#query-tracing): show timestamps in query traces in human-readable format (aka `RFC3339` in UTC timezone) instead of milliseconds since Unix epoch. For example, `2022-06-27T10:32:54.506Z` instead of `1656325974506`. 
* FEATURE: improve performance of [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) requests, which return big number of time series. +* FEATURE: expose additional histogram metrics at `http://victoriametrics:8428/metrics`, which may help in understanding the query workload: + + * `vm_rows_read_per_query` - the number of raw samples read per query. + * `vm_rows_scanned_per_query` - the number of raw samples scanned per query. This number can exceed `vm_rows_read_per_query` if the `step` query arg passed to [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) is smaller than the lookbehind window set in square brackets of [rollup function](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, if `increase(some_metric[1h])` is executed with `step=5m`, then the same raw samples on a one-hour time range are scanned `1h/5m=12` times. See [this article](https://valyala.medium.com/how-to-optimize-promql-and-metricsql-queries-85a1b75bf986) for details. + * `vm_rows_read_per_series` - the number of raw samples read per queried series. + * `vm_series_read_per_query` - the number of series read per query. * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): allow using `__name__` label (aka [metric name](https://prometheus.io/docs/prometheus/latest/querying/basics/#time-series-selectors)) in alerting annotations. For example `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}`. * BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple of gigabytes). Now the max cache size for parsed regexps is limited to a a few megabytes. 
diff --git a/docs/keyConcepts.md b/docs/keyConcepts.md index 6d893228b..858bd012f 100644 --- a/docs/keyConcepts.md +++ b/docs/keyConcepts.md @@ -131,36 +131,36 @@ functions used with gauges are [aggregation and grouping functions](#aggregation Histogram is a set of [counter](#counter) metrics with different labels for tracking the dispersion and [quantiles](https://prometheus.io/docs/practices/histograms/#quantiles) of the observed value. For example, in VictoriaMetrics we track how many rows is processed per query using the histogram with the -name `vm_per_query_rows_processed_count`. The exposition format for this histogram has the following form: +name `vm_rows_read_per_query`. The exposition format for this histogram has the following form: ``` -vm_per_query_rows_processed_count_bucket{vmrange="4.084e+02...4.642e+02"} 2 -vm_per_query_rows_processed_count_bucket{vmrange="5.275e+02...5.995e+02"} 1 -vm_per_query_rows_processed_count_bucket{vmrange="8.799e+02...1.000e+03"} 1 -vm_per_query_rows_processed_count_bucket{vmrange="1.468e+03...1.668e+03"} 3 -vm_per_query_rows_processed_count_bucket{vmrange="1.896e+03...2.154e+03"} 4 -vm_per_query_rows_processed_count_sum 15582 -vm_per_query_rows_processed_count_count 11 +vm_rows_read_per_query_bucket{vmrange="4.084e+02...4.642e+02"} 2 +vm_rows_read_per_query_bucket{vmrange="5.275e+02...5.995e+02"} 1 +vm_rows_read_per_query_bucket{vmrange="8.799e+02...1.000e+03"} 1 +vm_rows_read_per_query_bucket{vmrange="1.468e+03...1.668e+03"} 3 +vm_rows_read_per_query_bucket{vmrange="1.896e+03...2.154e+03"} 4 +vm_rows_read_per_query_sum 15582 +vm_rows_read_per_query_count 11 ``` -In practice, histogram `vm_per_query_rows_processed_count` may be used in the following way: +In practice, histogram `vm_rows_read_per_query` may be used in the following way: ```go // define the histogram -perQueryRowsProcessed := metrics.NewHistogram(`vm_per_query_rows_processed_count`) +rowsReadPerQuery := metrics.NewHistogram(`vm_rows_read_per_query`) // use 
the histogram during processing for _, query := range queries { - perQueryRowsProcessed.Update(len(query.Rows)) + rowsReadPerQuery.Update(float64(len(query.Rows))) } ``` -Now let's see what happens each time when `perQueryRowsProcessed.Update` is called: +Now let's see what happens each time when `rowsReadPerQuery.Update` is called: -* counter `vm_per_query_rows_processed_count_sum` increments by value of `len(query.Rows)` expression and accounts for +* counter `vm_rows_read_per_query_sum` increments by value of `len(query.Rows)` expression and accounts for total sum of all observed values; -* counter `vm_per_query_rows_processed_count_count` increments by 1 and accounts for total number of observations; -* counter `vm_per_query_rows_processed_count_bucket` gets incremented only if observed value is within the +* counter `vm_rows_read_per_query_count` increments by 1 and accounts for total number of observations; +* counter `vm_rows_read_per_query_bucket` gets incremented only if observed value is within the range (`bucket`) defined in `vmrange`. Such a combination of `counter` metrics allows @@ -823,4 +823,4 @@ details [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.ht ### Deduplication VictoriaMetrics supports data points deduplication after data was written to the storage. See more -details [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication). \ No newline at end of file +details [here](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#deduplication).