From 509339bf63d081e9642e44f99b0de26cd6b61454 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 6 Dec 2023 14:15:48 +0200 Subject: [PATCH] app/vmselect: properly adjust the lower bound for the time range where raw samples must be selected for default_rollup() function Previously the lower bound could be too small, which could result in missing values at the beginning of the graph for default_rollup() function. This function is automatically applied to all the series selectors if they aren't explicitly wrapped into a rollup function - see https://docs.victoriametrics.com/MetricsQL.html#implicit-query-conversions While at it, properly take into account `-search.minStalenessInterval` command-line flag when adjusting the lower bound for the selected time range. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5388 --- app/vmselect/promql/eval.go | 23 +++++++++++++++++------ app/vmselect/promql/rollup.go | 5 +---- docs/CHANGELOG.md | 1 + 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/app/vmselect/promql/eval.go b/app/vmselect/promql/eval.go index 294911e993..a5414155d5 100644 --- a/app/vmselect/promql/eval.go +++ b/app/vmselect/promql/eval.go @@ -945,7 +945,7 @@ func evalRollupFuncWithSubquery(qt *querytracer.Tracer, ec *EvalConfig, funcName } ecSQ := copyEvalConfig(ec) - ecSQ.Start -= window + maxSilenceInterval + step + ecSQ.Start -= window + step + maxSilenceInterval() ecSQ.End += step ecSQ.Step = step ecSQ.MaxPointsPerSeries = *maxPointsSubqueryPerTimeseries @@ -1699,12 +1699,12 @@ func evalRollupFuncNoCache(qt *querytracer.Tracer, ec *EvalConfig, funcName stri return nil, err } - // Fetch the remaining part of the result. + // Fetch the result. tfss := searchutils.ToTagFilterss(me.LabelFilterss) tfss = searchutils.JoinTagFilterss(tfss, ec.EnforcedTagFilterss) minTimestamp := ec.Start if needSilenceIntervalForRollupFunc(funcName) { - minTimestamp -= maxSilenceInterval + minTimestamp -= maxSilenceInterval() } if window > ec.Step { minTimestamp -= window @@ -1800,10 +1800,22 @@ func getRollupMemoryLimiter() *memoryLimiter { return &rollupMemoryLimiter } +func maxSilenceInterval() int64 { + d := minStalenessInterval.Milliseconds() + if d <= 0 { + d = 5 * 60 * 1000 + } + return d +} + func needSilenceIntervalForRollupFunc(funcName string) bool { - // All rollup the functions, which do not rely on the previous sample - // before the lookbehind window (aka prevValue), do not need silence interval. + // All the rollup functions, which do not rely on the previous sample + // before the lookbehind window (aka prevValue and realPrevValue), do not need silence interval. switch strings.ToLower(funcName) { + case "default_rollup": + // The default_rollup implicitly relies on the previous samples in order to fill gaps. + // See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5388 + return true case "absent_over_time", "avg_over_time", @@ -1812,7 +1824,6 @@ func needSilenceIntervalForRollupFunc(funcName string) bool { "count_le_over_time", "count_ne_over_time", "count_over_time", - "default_rollup", "first_over_time", "histogram_over_time", "hoeffding_bound_lower", diff --git a/app/vmselect/promql/rollup.go b/app/vmselect/promql/rollup.go index d10c40a861..d700cd4cd5 100644 --- a/app/vmselect/promql/rollup.go +++ b/app/vmselect/promql/rollup.go @@ -561,9 +561,6 @@ var ( inf = math.Inf(1) ) -// The maximum interval without previous rows. -const maxSilenceInterval = 5 * 60 * 1000 - type timeseriesMap struct { origin *timeseries h metrics.Histogram @@ -620,7 +617,7 @@ func (tsm *timeseriesMap) GetOrCreateTimeseries(labelName, labelValue string) *t // // rc.Timestamps are used as timestamps for dstValues. // -// timestamps must cover time range [rc.Start - rc.Window - maxSilenceInterval ... rc.End]. +// It is expected that timestamps cover the time range [rc.Start - rc.Window ... rc.End]. // // Do cannot be called from concurrent goroutines. func (rc *rollupConfig) Do(dstValues []float64, values []float64, timestamps []int64) ([]float64, uint64) { diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 125fee25bd..6bc7279801 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -49,6 +49,7 @@ The sandbox cluster installation is running under the constant load generated by * `go_scavenge_cpu_seconds_total` - the [counter](https://docs.victoriametrics.com/keyConcepts.html#counter), which shows the total CPU time spent by Go runtime for returning memory to the Operating System. * `go_memlimit_bytes` - the value of [GOMEMLIMIT](https://pkg.go.dev/runtime#hdr-Environment_Variables) environment variable. +* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly calculate values for the first point on the graph for queries, which do not use [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions). For example, previously `count(up)` could return lower than expected values for the first point on the graph. This also could result in lower than expected values in the middle of the graph like in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5388) when the response caching isn't disabled. The issue has been introduced in [v1.95.0](https://docs.victoriametrics.com/CHANGELOG.html#v1950). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): prevent from `FATAL: cannot flush metainfo` panic when [`-remoteWrite.multitenantURL`](https://docs.victoriametrics.com/vmagent.html#multitenancy) command-line flag is set. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5357). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly decode zstd-encoded data blocks received via [VictoriaMetrics remote_write protocol](https://docs.victoriametrics.com/vmagent.html#victoriametrics-remote-write-protocol). See [this issue comment](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5301#issuecomment-1815871992). * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly add new labels at `output_relabel_configs` during [stream aggregation](https://docs.victoriametrics.com/stream-aggregation.html). Previously this could lead to corrupted labels in output samples. Thanks to @ChengChung for providing [detailed report for this bug](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5402).