app/vmselect/promql: properly handle partial counter resets in rate(), irate(), increase() and remove_resets() functions

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
2024-12-14 16:12:15 +01:00 · 2022-06-30 22:39:38 +03:00 · 2022-06-30 22:39:38 +03:00 · fa08220d27
commit fa08220d27
parent 1e6b0a1f54
5 changed files with 32 additions and 15 deletions
--- a/app/vmselect/promql/exec_test.go
+++ b/app/vmselect/promql/exec_test.go
@@ -6568,7 +6568,7 @@ func TestExecSuccess(t *testing.T) {
 		q := `rate((2000-time())[100s:100s])`
 		r := netstorage.Result{
 			MetricName: metricNameExpected,
-			Values:     []float64{5.5, 4.5, 6.5, 4.5, 2.5, 0.5},
+			Values:     []float64{0, 0, 6.5, 4.5, 2.5, 0.5},
 			Timestamps: timestampsExpected,
 		}
 		resultExpected := []netstorage.Result{r}
@@ -6579,7 +6579,7 @@ func TestExecSuccess(t *testing.T) {
 		q := `rate((2000-time())[100s:100s] offset 100s)`
 		r := netstorage.Result{
 			MetricName: metricNameExpected,
-			Values:     []float64{6, 5, 7.5, 5.5, 3.5, 1.5},
+			Values:     []float64{0, 0, 3.5, 5.5, 3.5, 1.5},
 			Timestamps: timestampsExpected,
 		}
 		resultExpected := []netstorage.Result{r}
@@ -6590,7 +6590,7 @@ func TestExecSuccess(t *testing.T) {
 		q := `rate((2000-time())[100s:100s] offset 100s)[:] offset 100s`
 		r := netstorage.Result{
 			MetricName: metricNameExpected,
-			Values:     []float64{7, 6, 5, 7.5, 5.5, 3.5},
+			Values:     []float64{0, 0, 0, 3.5, 5.5, 3.5},
 			Timestamps: timestampsExpected,
 		}
 		resultExpected := []netstorage.Result{r}
@@ -6746,7 +6746,7 @@ func TestExecSuccess(t *testing.T) {
 	})
 	t.Run(`remove_resets()`, func(t *testing.T) {
 		t.Parallel()
-		q := `remove_resets( abs(1500-time()) )`
+		q := `remove_resets(abs(1500-time()))`
 		r := netstorage.Result{
 			MetricName: metricNameExpected,
 			Values:     []float64{500, 800, 900, 900, 1100, 1300},
@@ -6755,6 +6755,20 @@ func TestExecSuccess(t *testing.T) {
 		resultExpected := []netstorage.Result{r}
 		f(q, resultExpected)
 	})
+	t.Run(`remove_resets(sum)`, func(t *testing.T) {
+		t.Parallel()
+		q := `remove_resets(sum(
+			alias(time(), "full"),
+			alias(time()/5 < 300, "partial"),
+		))`
+		r := netstorage.Result{
+			MetricName: metricNameExpected,
+			Values:     []float64{1200, 1440, 1680, 1680, 1880, 2080},
+			Timestamps: timestampsExpected,
+		}
+		resultExpected := []netstorage.Result{r}
+		f(q, resultExpected)
+	})
 	t.Run(`range_avg(time())`, func(t *testing.T) {
 		t.Parallel()
 		q := `range_avg(time())`
@@ -6945,10 +6959,10 @@ func TestExecSuccess(t *testing.T) {
 	})
 	t.Run(`aggr_over_time(single-func)`, func(t *testing.T) {
 		t.Parallel()
-		q := `aggr_over_time("increase", rand(0)[:10s])`
+		q := `round(aggr_over_time("increase", rand(0)[:10s]),0.01)`
 		r1 := netstorage.Result{
 			MetricName: metricNameExpected,
-			Values:     []float64{5.465672601448873, 6.642207999066246, 6.8400051805114295, 7.182425481980655, 5.1677922402706, 6.594060518641982},
+			Values:     []float64{5.47, 6.64, 6.84, 7.24, 5.17, 6.59},
 			Timestamps: timestampsExpected,
 		}
 		r1.MetricName.Tags = []storage.Tag{{
--- a/app/vmselect/promql/rollup.go
+++ b/app/vmselect/promql/rollup.go
@@ -704,9 +704,9 @@ func removeCounterResets(values []float64) {
 		d := v - prevValue
 		if d < 0 {
 			if (-d * 8) < prevValue {
-				// This is likely jitter from `Prometheus HA pairs`.
-				// Just substitute v with prevValue.
-				v = prevValue
+				// This is likely a partial counter reset.
+				// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
+				correction += prevValue - v
 			} else {
 				correction += prevValue
 			}
--- a/app/vmselect/promql/rollup_test.go
+++ b/app/vmselect/promql/rollup_test.go
@@ -100,10 +100,11 @@ func TestRemoveCounterResets(t *testing.T) {
 	timestampsExpected := []int64{0, 1, 2, 3}
 	testRowsEqual(t, values, timestampsExpected, valuesExpected, timestampsExpected)

-	// verify how jitter from `Prometheus HA pairs` is handled
-	values = []float64{100, 95, 120, 140, 137, 50}
+	// verify how partial counter reset is handled.
+	// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
+	values = []float64{100, 95, 120, 119, 139, 50}
 	removeCounterResets(values)
-	valuesExpected = []float64{100, 100, 120, 140, 140, 190}
+	valuesExpected = []float64{100, 100, 125, 125, 145, 195}
 	timestampsExpected = []int64{0, 1, 2, 3, 4, 5}
 	testRowsEqual(t, values, timestampsExpected, valuesExpected, timestampsExpected)
 }
--- a/app/vmselect/promql/transform.go
+++ b/app/vmselect/promql/transform.go
@@ -2329,9 +2329,9 @@ func removeCounterResetsMaybeNaNs(values []float64) {
 		d := v - prevValue
 		if d < 0 {
 			if (-d * 8) < prevValue {
-				// This is likely jitter from `Prometheus HA pairs`.
-				// Just substitute v with prevValue.
-				v = prevValue
+				// This is likely a partial counter reset.
+				// See https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787
+				correction += prevValue - v
 			} else {
 				correction += prevValue
 			}
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -35,6 +35,7 @@ scrape_configs:

 * FEATURE: [query tracing](https://docs.victoriametrics.com/Single-server-VictoriaMetrics.html#query-tracing): show timestamps in query traces in human-readable format (aka `RFC3339` in UTC timezone) instead of milliseconds since Unix epoch. For example, `2022-06-27T10:32:54.506Z` instead of `1656325974506`. This improves traces' readability.
 * FEATURE: improve performance of [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers) requests, which return big number of time series.
+* FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle partial counter resets in [remove_resets](https://docs.victoriametrics.com/MetricsQL.html#remove_resets) function. Now `remove_resets(sum(m))` should return the expected increasing line when some time series matching `m` disappear on the selected time range. Previously such a query would return a horizontal line after the disappeared series.
 * FEATURE: expose additional histogram metrics at `http://victoriametrics:8428/metrics`, which may help understanding query workload:

  * `vm_rows_read_per_query` - the number of raw samples read per query.
@@ -51,6 +52,7 @@ scrape_configs:
 {% endraw %}

 * BUGFIX: limit max memory occupied by the cache, which stores parsed regular expressions. Previously too long regular expressions passed in [MetricsQL queries](https://docs.victoriametrics.com/MetricsQL.html) could result in big amounts of used memory (e.g. multiple gigabytes). Now the max cache size for parsed regexps is limited to a few megabytes.
+* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle partial counter resets when calculating [rate](https://docs.victoriametrics.com/MetricsQL.html#rate), [irate](https://docs.victoriametrics.com/MetricsQL.html#irate) and [increase](https://docs.victoriametrics.com/MetricsQL.html#increase) functions. Previously these functions could return zero values after partial counter resets until the counter increases to the last value before partial counter reset. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2787).
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): make sure that [stale markers](https://docs.victoriametrics.com/vmagent.html#prometheus-staleness-markers) are generated with the actual timestamp when unsuccessful scrape occurs. This should prevent possible time series overlap on scrape target restart in dynamic environments such as Kubernetes.
 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): properly reload changed `-promscrape.config` file when `-promscrape.configCheckInterval` option is set. The changed config file wasn't reloaded in this case since [v1.69.0](#v1690). See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2786). Thanks to @ttyv for the fix.
 * BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): assume that the response is complete if `-search.denyPartialResponse` is enabled and up to `-replicationFactor - 1` `vmstorage` nodes are unavailable. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1767).