From a2f83115ae697bb2e0eb20e11191aeef2692ada8 Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Thu, 25 Jan 2024 15:42:57 +0100
Subject: [PATCH] app/vmalert: autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0` (#5680)

* app/vmalert: autogenerate `ALERTS_FOR_STATE` time series for alerting rules
with `for: 0`

Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`.
This differed from Prometheus, which generates ALERTS_FOR_STATE time series
for alerting rules with `for: 0` as well. Such time series can be useful for
tracking the moment when an alerting rule became active.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3056

Signed-off-by: hagen1778

* app/vmalert: support ALERTS_FOR_STATE in `replay` mode

Signed-off-by: hagen1778

---------

Signed-off-by: hagen1778
---
 app/vmalert/Makefile              |   1 +
 app/vmalert/rule/alerting.go      |  26 +++-----
 app/vmalert/rule/alerting_test.go | 103 +++++++++++++++++++++---------
 docs/CHANGELOG.md                 |   1 +
 4 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/app/vmalert/Makefile b/app/vmalert/Makefile
index 9f200f0c22..0c57ffb299 100644
--- a/app/vmalert/Makefile
+++ b/app/vmalert/Makefile
@@ -68,6 +68,7 @@ publish-vmalert:
 
 test-vmalert:
 	go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
+	go test -v -race -cover ./app/vmalert/rule
 	go test -v -race -cover ./app/vmalert/templates
 	go test -v -race -cover ./app/vmalert/datasource
 	go test -v -race -cover ./app/vmalert/notifier
diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go
index 7ca4427eb7..be50ef1335 100644
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@@ -324,16 +324,6 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 			return nil, fmt.Errorf("failed to create alert: %w", err)
 		}
 
-		// if alert is instant, For: 0
-		if ar.For == 0 {
-			a.State = notifier.StateFiring
-			for i := range s.Values {
-				result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
-			}
-			continue
-		}
-
-		// if alert with For > 0
 		prevT := time.Time{}
 		for i := range s.Values {
 			at := time.Unix(s.Timestamps[i], 0)
@@ -354,6 +344,10 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 				a.Start = at
 			}
 			prevT = at
+			if ar.For == 0 {
+				// rules with `for: 0` are always firing when they have Value
+				a.State = notifier.StateFiring
+			}
 			result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
 
 			// save alert's state on last iteration, so it can be used on the next execRange call
@@ -559,9 +553,9 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
 }
 
 const (
-	// alertMetricName is the metric name for synthetic alert timeseries.
+	// alertMetricName is the metric name for time series reflecting the alert state.
 	alertMetricName = "ALERTS"
-	// alertForStateMetricName is the metric name for 'for' state of alert.
+	// alertForStateMetricName is the metric name for time series reflecting the moment of time when alert became active.
 	alertForStateMetricName = "ALERTS_FOR_STATE"
 
 	// alertNameLabel is the label name indicating the name of an alert.
@@ -576,12 +570,10 @@ const (
 
 // alertToTimeSeries converts the given alert with the given timestamp to time series
 func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
-	var tss []prompbmarshal.TimeSeries
-	tss = append(tss, alertToTimeSeries(a, timestamp))
-	if ar.For > 0 {
-		tss = append(tss, alertForToTimeSeries(a, timestamp))
+	return []prompbmarshal.TimeSeries{
+		alertToTimeSeries(a, timestamp),
+		alertForToTimeSeries(a, timestamp),
 	}
-	return tss
 }
 
 func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go
index 3e5e3503f9..5b7fb54045 100644
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@@ -28,20 +28,26 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 	}{
 		{
 			newTestAlertingRule("instant", 0),
-			&notifier.Alert{State: notifier.StateFiring},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
 					alertStateLabel: notifier.StateFiring.String(),
 				}),
+				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
+					[]int64{timestamp.UnixNano()},
+					map[string]string{
+						"__name__": alertForStateMetricName,
+					}),
 			},
 		},
 		{
 			newTestAlertingRule("instant extra labels", 0),
-			&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
-				"job":      "foo",
-				"instance": "bar",
-			}},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+				Labels: map[string]string{
+					"job":      "foo",
+					"instance": "bar",
+				}},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
@@ -49,19 +55,33 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 					"job":      "foo",
 					"instance": "bar",
 				}),
+				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
+					[]int64{timestamp.UnixNano()},
+					map[string]string{
+						"__name__": alertForStateMetricName,
+						"job":      "foo",
+						"instance": "bar",
+					}),
 			},
 		},
 		{
 			newTestAlertingRule("instant labels override", 0),
-			&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
-				alertStateLabel: "foo",
-				"__name__":      "bar",
-			}},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+				Labels: map[string]string{
+					alertStateLabel: "foo",
+					"__name__":      "bar",
+				}},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
 					alertStateLabel: notifier.StateFiring.String(),
 				}),
+				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
+					[]int64{timestamp.UnixNano()},
+					map[string]string{
+						"__name__":      alertForStateMetricName,
+						alertStateLabel: "foo",
+					}),
 			},
 		},
 		{
@@ -367,7 +387,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			{Values: []float64{1}, Timestamps: []int64{1}},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
 		},
 		nil,
 	},
@@ -378,8 +398,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 		},
 		[]*notifier.Alert{
 			{
-				Labels: map[string]string{"name": "foo"},
-				State:  notifier.StateFiring,
+				Labels:   map[string]string{"name": "foo"},
+				State:    notifier.StateFiring,
+				ActiveAt: time.Unix(1, 0),
 			},
 		},
 		nil,
@@ -390,9 +411,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring},
-			{State: notifier.StateFiring},
-			{State: notifier.StateFiring},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)},
 		},
 		nil,
 	},
@@ -460,6 +481,20 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			For: time.Second,
 		}},
 	},
+	{
+		newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second),
+		[]datasource.Metric{
+			{Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}},
+		},
+		[]*notifier.Alert{
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
+			// ActiveAt is expected to remain the same while the rule continues to fire on each iteration
+			{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
+		},
+		nil,
+	},
 	{
 		newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
 		[]datasource.Metric{
@@ -534,21 +569,25 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"source": "vm",
-			}},
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"source": "vm",
-			}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				Labels: map[string]string{
+					"source": "vm",
+				}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
+				Labels: map[string]string{
+					"source": "vm",
+				}},
 			//
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"foo":    "bar",
-				"source": "vm",
-			}},
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"foo":    "bar",
-				"source": "vm",
-			}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				Labels: map[string]string{
+					"foo":    "bar",
+					"source": "vm",
+				}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
+				Labels: map[string]string{
+					"foo":    "bar",
+					"source": "vm",
+				}},
 		},
 		nil,
 	},
@@ -1095,6 +1134,12 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
 	return &rule
 }
 
+func newTestAlertingRuleWithEvalInterval(name string, waitFor, evalInterval time.Duration) *AlertingRule {
+	rule := newTestAlertingRule(name, waitFor)
+	rule.EvalInterval = evalInterval
+	return rule
+}
+
 func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
 	rule := newTestAlertingRule(name, waitFor)
 	rule.KeepFiringFor = keepFiringFor
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index c7f0746202..96d6cdccc6 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -70,6 +70,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: `vmstorage`: properly expire `storage/prefetchedMetricIDs` cache. Previously this cache was never expired, so it could grow big under [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). This could result in increasing CPU load over time.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0`. Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. The change aligns with Prometheus behavior. See more details in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return results from [bottomk](https://docs.victoriametrics.com/MetricsQL.html#bottomk) and `bottomk_*()` functions when some of these results contain NaN values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5506). Thanks to @xiaozongyang for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5509).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
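
Illustration (a minimal, self-contained Go sketch, not vmalert code): the per-evaluation series shape that an alerting rule with `for: 0` now produces, mirroring the updated alertToTimeSeries above. The `series` type and `buildAlertSeries` helper are hypothetical stand-ins for vmalert internals, and the "alertstate" label name is assumed to match the alertStateLabel constant referenced in the diff.

package main

import (
	"fmt"
	"time"
)

// series is a simplified stand-in for one produced time series:
// metric name, label set and a single (timestamp, value) sample.
type series struct {
	name   string
	labels map[string]string
	ts     time.Time
	value  float64
}

// buildAlertSeries sketches what a single evaluation yields after this change:
// an ALERTS sample with value 1 carrying the alert state, plus an
// ALERTS_FOR_STATE sample whose value is the Unix timestamp at which the alert
// became active, now emitted even when the rule has `for: 0`.
func buildAlertSeries(state string, activeAt, evalTime time.Time) []series {
	return []series{
		{
			name:   "ALERTS",
			labels: map[string]string{"alertstate": state}, // assumed label name
			ts:     evalTime,
			value:  1,
		},
		{
			name:   "ALERTS_FOR_STATE",
			labels: map[string]string{},
			ts:     evalTime,
			value:  float64(activeAt.Unix()),
		},
	}
}

func main() {
	activeAt := time.Now().Add(-30 * time.Second)
	for _, s := range buildAlertSeries("firing", activeAt, time.Now()) {
		fmt.Printf("%s%v = %v @ %d\n", s.name, s.labels, s.value, s.ts.Unix())
	}
}

With the ALERTS_FOR_STATE sample present for `for: 0` rules as well, the moment a rule became active can be derived with a query along the lines of `time() - ALERTS_FOR_STATE`, the same way it already worked for rules with `for > 0`.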