From 8a786e5df4d7b42c78d5eddf8d9542179b263c05 Mon Sep 17 00:00:00 2001
From: Hui Wang <hui.wang@daocloud.io>
Date: Mon, 30 Oct 2023 20:54:18 +0800
Subject: [PATCH] vmalert: fix alert firing state in replay mode (#5192)

fix possible missing firing states for alerting rules in replay mode
Before if one firing stage is bigger than single query request range, like rule with a big `for`, alerting rule won't able to be detected as firing.

Co-authored-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit abcb21aa5ee918ba9a4e9cde495dba06e1e9564c)
---
 app/vmalert/rule/alerting.go      | 33 ++++++++++--
 app/vmalert/rule/alerting_test.go | 90 ++++++++++++++++++++++++++++---
 docs/CHANGELOG.md                 |  1 +
 3 files changed, 112 insertions(+), 12 deletions(-)

diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go
index a5a2cabdb7..2139a6f56d 100644
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@@ -295,24 +295,33 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
 }
 
 // execRange executes alerting rule on the given time range similarly to exec.
-// It doesn't update internal states of the Rule and meant to be used just
-// to get time series for backfilling.
-// It returns ALERT and ALERT_FOR_STATE time series as result.
+// When making consecutive calls make sure to respect time linearity for start and end params,
+// as this function modifies AlertingRule alerts state.
+// It is not thread safe.
+// It returns ALERT and ALERT_FOR_STATE time series as a result.
 func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
 	res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
 	if err != nil {
 		return nil, err
 	}
 	var result []prompbmarshal.TimeSeries
+	holdAlertState := make(map[uint64]*notifier.Alert)
 	qFn := func(query string) ([]datasource.Metric, error) {
 		return nil, fmt.Errorf("`query` template isn't supported in replay mode")
 	}
 	for _, s := range res.Data {
+		ls, err := ar.toLabels(s, qFn)
+		if err != nil {
+			return nil, fmt.Errorf("failed to expand labels: %s", err)
+		}
+		h := hash(ls.processed)
 		a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
 		if err != nil {
 			return nil, fmt.Errorf("failed to create alert: %w", err)
 		}
-		if ar.For == 0 { // if alert is instant
+
+		// if alert is instant, For: 0
+		if ar.For == 0 {
 			a.State = notifier.StateFiring
 			for i := range s.Values {
 				result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
@@ -324,18 +333,32 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 		prevT := time.Time{}
 		for i := range s.Values {
 			at := time.Unix(s.Timestamps[i], 0)
+			// try to restore alert's state on the first iteration
+			if at.Equal(start) {
+				if _, ok := ar.alerts[h]; ok {
+					a = ar.alerts[h]
+					prevT = at
+				}
+			}
 			if at.Sub(prevT) > ar.EvalInterval {
 				// reset to Pending if there are gaps > EvalInterval between DPs
 				a.State = notifier.StatePending
 				a.ActiveAt = at
-			} else if at.Sub(a.ActiveAt) >= ar.For {
+				a.Start = time.Time{}
+			} else if at.Sub(a.ActiveAt) >= ar.For && a.State != notifier.StateFiring {
 				a.State = notifier.StateFiring
 				a.Start = at
 			}
 			prevT = at
 			result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
+
+			// save alert's state on last iteration, so it can be used on the next execRange call
+			if at.Equal(end) {
+				holdAlertState[h] = a
+			}
 		}
 	}
+	ar.alerts = holdAlertState
 	return result, nil
 }
 
diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go
index cfb8fdbf30..759ffb3fed 100644
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@@ -346,15 +346,18 @@ func TestAlertingRule_Exec(t *testing.T) {
 }
 
 func TestAlertingRule_ExecRange(t *testing.T) {
+	fakeGroup := Group{Name: "TestRule_ExecRange"}
 	testCases := []struct {
-		rule      *AlertingRule
-		data      []datasource.Metric
-		expAlerts []*notifier.Alert
+		rule                    *AlertingRule
+		data                    []datasource.Metric
+		expAlerts               []*notifier.Alert
+		expHoldAlertStateAlerts map[uint64]*notifier.Alert
 	}{
 		{
 			newTestAlertingRule("empty", 0),
 			[]datasource.Metric{},
 			nil,
+			nil,
 		},
 		{
 			newTestAlertingRule("empty labels", 0),
@@ -364,6 +367,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			[]*notifier.Alert{
 				{State: notifier.StateFiring},
 			},
+			nil,
 		},
 		{
 			newTestAlertingRule("single-firing", 0),
@@ -376,6 +380,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 					State:  notifier.StateFiring,
 				},
 			},
+			nil,
 		},
 		{
 			newTestAlertingRule("single-firing-on-range", 0),
@@ -387,6 +392,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{State: notifier.StateFiring},
 				{State: notifier.StateFiring},
 			},
+			nil,
 		},
 		{
 			newTestAlertingRule("for-pending", time.Second),
@@ -398,6 +404,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
 				{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
 			},
+			map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): {
+				GroupID:     fakeGroup.ID(),
+				Name:        "for-pending",
+				Labels:      map[string]string{"alertname": "for-pending"},
+				Annotations: map[string]string{},
+				State:       notifier.StatePending,
+				ActiveAt:    time.Unix(5, 0),
+				Value:       1,
+				For:         time.Second,
+			}},
 		},
 		{
 			newTestAlertingRule("for-firing", 3*time.Second),
@@ -409,6 +425,38 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
 				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
 			},
+			map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): {
+				GroupID:     fakeGroup.ID(),
+				Name:        "for-firing",
+				Labels:      map[string]string{"alertname": "for-firing"},
+				Annotations: map[string]string{},
+				State:       notifier.StateFiring,
+				ActiveAt:    time.Unix(1, 0),
+				Start:       time.Unix(5, 0),
+				Value:       1,
+				For:         3 * time.Second,
+			}},
+		},
+		{
+			newTestAlertingRule("for-hold-pending", time.Second),
+			[]datasource.Metric{
+				{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}},
+			},
+			[]*notifier.Alert{
+				{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
+				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
+				{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
+			},
+			map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): {
+				GroupID:     fakeGroup.ID(),
+				Name:        "for-hold-pending",
+				Labels:      map[string]string{"alertname": "for-hold-pending"},
+				Annotations: map[string]string{},
+				State:       notifier.StatePending,
+				ActiveAt:    time.Unix(5, 0),
+				Value:       1,
+				For:         time.Second,
+			}},
 		},
 		{
 			newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
@@ -422,9 +470,10 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
 				{State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
 			},
+			nil,
 		},
 		{
-			newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
+			newTestAlertingRule("multi-series", 3*time.Second),
 			[]datasource.Metric{
 				{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
 				{
@@ -436,7 +485,6 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
 				{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
 				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
-				//
 				{
 					State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
 					Labels: map[string]string{
@@ -450,6 +498,29 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 					},
 				},
 			},
+			map[uint64]*notifier.Alert{
+				hash(map[string]string{"alertname": "multi-series"}): {
+					GroupID:     fakeGroup.ID(),
+					Name:        "multi-series",
+					Labels:      map[string]string{"alertname": "multi-series"},
+					Annotations: map[string]string{},
+					State:       notifier.StateFiring,
+					ActiveAt:    time.Unix(1, 0),
+					Start:       time.Unix(5, 0),
+					Value:       1,
+					For:         3 * time.Second,
+				},
+				hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): {
+					GroupID:     fakeGroup.ID(),
+					Name:        "multi-series",
+					Labels:      map[string]string{"alertname": "multi-series", "foo": "bar"},
+					Annotations: map[string]string{},
+					State:       notifier.StatePending,
+					ActiveAt:    time.Unix(5, 0),
+					Value:       1,
+					For:         3 * time.Second,
+				},
+			},
 		},
 		{
 			newTestRuleWithLabels("multi-series-firing", "source", "vm"),
@@ -477,16 +548,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 					"source": "vm",
 				}},
 			},
+			nil,
 		},
 	}
-	fakeGroup := Group{Name: "TestRule_ExecRange"}
 	for _, tc := range testCases {
 		t.Run(tc.rule.Name, func(t *testing.T) {
 			fq := &datasource.FakeQuerier{}
 			tc.rule.q = fq
 			tc.rule.GroupID = fakeGroup.ID()
 			fq.Add(tc.data...)
-			gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
+			gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0))
 			if err != nil {
 				t.Fatalf("unexpected err: %s", err)
 			}
@@ -512,6 +583,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 					t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
 				}
 			}
+			if tc.expHoldAlertStateAlerts != nil {
+				if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) {
+					t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts)
+				}
+			}
 		})
 	}
 }
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index 6a08f730a2..a7a4462dd1 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -59,6 +59,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if alert's `for` param was bigger than replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details.
 * BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087).
 * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
 * BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.