vmalert: fix alert firing state in replay mode (#5192)

fix possible missing firing states for alerting rules in replay mode
Previously, if a firing stage was longer than a single query request range (e.g. a rule with a big `for`), the alerting rule could not be detected as firing.

Co-authored-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit abcb21aa5e)
Hui Wang 2023-10-30 20:54:18 +08:00 committed by hagen1778
parent f0d10e2004
commit 8a786e5df4
3 changed files with 112 additions and 12 deletions
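For context, a rough sketch of the failing scenario from the commit message (illustrative numbers only; the assumption that replay splits the backfilled period into consecutive query windows, sized by the group's interval and `-replay.maxDatapointsPerQuery`, is not part of this diff):

package main

import (
	"fmt"
	"time"
)

func main() {
	window := time.Hour     // assumed size of a single replay query window
	forDur := 4 * time.Hour // rule's `for`, bigger than one query window

	for i := 1; i <= 6; i++ {
		carried := time.Duration(i) * window // pending time when alert state is carried across windows (this fix)
		fresh := window                      // pending time when every window starts from a fresh state (old behavior)
		fmt.Printf("window %d: carried=%v fires=%v | fresh=%v fires=%v\n",
			i, carried, carried >= forDur, fresh, fresh >= forDur)
	}
}

With a fresh state per window the alert never accumulates more than one window of pending time, which is why rules with a big `for` could never reach the firing state during replay.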


@@ -295,24 +295,33 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
}
// execRange executes alerting rule on the given time range similarly to exec.
// It doesn't update internal states of the Rule and meant to be used just
// to get time series for backfilling.
// It returns ALERT and ALERT_FOR_STATE time series as result.
// When making consecutive calls make sure to respect time linearity for start and end params,
// as this function modifies AlertingRule alerts state.
// It is not thread safe.
// It returns ALERT and ALERT_FOR_STATE time series as a result.
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
if err != nil {
return nil, err
}
var result []prompbmarshal.TimeSeries
holdAlertState := make(map[uint64]*notifier.Alert)
qFn := func(query string) ([]datasource.Metric, error) {
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
}
for _, s := range res.Data {
ls, err := ar.toLabels(s, qFn)
if err != nil {
return nil, fmt.Errorf("failed to expand labels: %s", err)
}
h := hash(ls.processed)
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
if err != nil {
return nil, fmt.Errorf("failed to create alert: %w", err)
}
if ar.For == 0 { // if alert is instant
// if alert is instant, For: 0
if ar.For == 0 {
a.State = notifier.StateFiring
for i := range s.Values {
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
@@ -324,18 +333,32 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
prevT := time.Time{}
for i := range s.Values {
at := time.Unix(s.Timestamps[i], 0)
// try to restore alert's state on the first iteration
if at.Equal(start) {
if _, ok := ar.alerts[h]; ok {
a = ar.alerts[h]
prevT = at
}
}
if at.Sub(prevT) > ar.EvalInterval {
// reset to Pending if there are gaps > EvalInterval between DPs
a.State = notifier.StatePending
a.ActiveAt = at
} else if at.Sub(a.ActiveAt) >= ar.For {
a.Start = time.Time{}
} else if at.Sub(a.ActiveAt) >= ar.For && a.State != notifier.StateFiring {
a.State = notifier.StateFiring
a.Start = at
}
prevT = at
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
// save alert's state on last iteration, so it can be used on the next execRange call
if at.Equal(end) {
holdAlertState[h] = a
}
}
}
ar.alerts = holdAlertState
return result, nil
}
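A minimal caller sketch, not part of this commit: replayRule and the one-hour window are hypothetical, assuming code in the same package as AlertingRule. It shows the calling pattern the new doc comment requires, where consecutive execRange calls move forward in time so the per-series state saved at the end of one window can be restored at the start of the next.

// replayRule is a hypothetical helper illustrating consecutive execRange calls.
func replayRule(ctx context.Context, ar *AlertingRule, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
	const window = time.Hour // assumed size of a single query window
	var result []prompbmarshal.TimeSeries
	for ws := start; ws.Before(end); ws = ws.Add(window) {
		we := ws.Add(window)
		if we.After(end) {
			we = end
		}
		// execRange restores the alert state held from the previous window (ar.alerts),
		// so a `for` longer than one window can still reach the firing state.
		tss, err := ar.execRange(ctx, ws, we)
		if err != nil {
			return nil, err
		}
		result = append(result, tss...)
	}
	return result, nil
}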


@@ -346,15 +346,18 @@ func TestAlertingRule_Exec(t *testing.T) {
}
func TestAlertingRule_ExecRange(t *testing.T) {
fakeGroup := Group{Name: "TestRule_ExecRange"}
testCases := []struct {
rule *AlertingRule
data []datasource.Metric
expAlerts []*notifier.Alert
rule *AlertingRule
data []datasource.Metric
expAlerts []*notifier.Alert
expHoldAlertStateAlerts map[uint64]*notifier.Alert
}{
{
newTestAlertingRule("empty", 0),
[]datasource.Metric{},
nil,
nil,
},
{
newTestAlertingRule("empty labels", 0),
@@ -364,6 +367,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
[]*notifier.Alert{
{State: notifier.StateFiring},
},
nil,
},
{
newTestAlertingRule("single-firing", 0),
@@ -376,6 +380,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
State: notifier.StateFiring,
},
},
nil,
},
{
newTestAlertingRule("single-firing-on-range", 0),
@@ -387,6 +392,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StateFiring},
{State: notifier.StateFiring},
},
nil,
},
{
newTestAlertingRule("for-pending", time.Second),
@@ -398,6 +404,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): {
GroupID: fakeGroup.ID(),
Name: "for-pending",
Labels: map[string]string{"alertname": "for-pending"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: time.Second,
}},
},
{
newTestAlertingRule("for-firing", 3*time.Second),
@@ -409,6 +425,38 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): {
GroupID: fakeGroup.ID(),
Name: "for-firing",
Labels: map[string]string{"alertname": "for-firing"},
Annotations: map[string]string{},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
Start: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
}},
},
{
newTestAlertingRule("for-hold-pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): {
GroupID: fakeGroup.ID(),
Name: "for-hold-pending",
Labels: map[string]string{"alertname": "for-hold-pending"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: time.Second,
}},
},
{
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
@@ -422,9 +470,10 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
},
nil,
},
{
newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
newTestAlertingRule("multi-series", 3*time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
{
@@ -436,7 +485,6 @@ func TestAlertingRule_ExecRange(t *testing.T) {
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
//
{
State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
Labels: map[string]string{
@@ -450,6 +498,29 @@ func TestAlertingRule_ExecRange(t *testing.T) {
},
},
},
map[uint64]*notifier.Alert{
hash(map[string]string{"alertname": "multi-series"}): {
GroupID: fakeGroup.ID(),
Name: "multi-series",
Labels: map[string]string{"alertname": "multi-series"},
Annotations: map[string]string{},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
Start: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
},
hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): {
GroupID: fakeGroup.ID(),
Name: "multi-series",
Labels: map[string]string{"alertname": "multi-series", "foo": "bar"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
},
},
},
{
newTestRuleWithLabels("multi-series-firing", "source", "vm"),
@@ -477,16 +548,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
"source": "vm",
}},
},
nil,
},
}
fakeGroup := Group{Name: "TestRule_ExecRange"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
fq.Add(tc.data...)
gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0))
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
@@ -512,6 +583,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
}
}
if tc.expHoldAlertStateAlerts != nil {
if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) {
t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts)
}
}
})
}
}


@@ -59,6 +59,7 @@ The sandbox cluster installation is running under the constant load generated by
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is enabled by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `-remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on machines with 32-bit architectures.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer to files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if the alert's `for` param is bigger than the replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details.
* BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087).
* BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
* BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.