vmalert: avoid sending resolves for pending alerts (#498)

Before the change we were sending notifications to notifier if following conditions are met: * alert is in Fire state * alert is in Inactive state We were sending Inactive notifications to resolve alert ASAP. Unfortunately, we were sending resolves for Pending alerts that become Inactive, which is wrong. In this change we delete alert from the active list if it was Pending and become Inactive. In this way we now have Inactive alerts only if they were in state Fire before. See test change for example.
2024-12-15 08:23:34 +01:00 · 2020-05-17 15:13:22 +01:00 · 2020-05-17 15:13:22 +01:00 · c7f3e58032
commit c7f3e58032
parent e5f5342e18
3 changed files with 48 additions and 33 deletions
--- a/app/vmalert/group.go
+++ b/app/vmalert/group.go
@ -69,7 +69,7 @@ func (g *Group) updateWith(newGroup Group) {

 		// copy all significant fields.
 		// alerts state isn't copied since
-		// it should be updated in next 2 Evals
+		// it should be updated in next 2 Execs
 		or.For = nr.For
 		or.Expr = nr.Expr
 		or.Labels = nr.Labels
@ -151,30 +151,47 @@ func (g *Group) start(ctx context.Context, interval time.Duration,

 				var alertsToSend []notifier.Alert
 				for _, a := range rule.alerts {
-					if a.State != notifier.StatePending {
+					switch a.State {
+					case notifier.StateFiring:
+						// set End to execStart + 3 intervals
+						// so notifier can resolve it automatically if `vmalert`
+						// won't be able to send resolve for some reason
+						a.End = execStart.Add(3 * interval)
+						alertsToSend = append(alertsToSend, *a)
+						pushToRW(rw, rule, a, execStart)
+					case notifier.StatePending:
+						pushToRW(rw, rule, a, execStart)
+					case notifier.StateInactive:
+						// set End to execStart to notify
+						// that it was just resolved
+						a.End = execStart
 						alertsToSend = append(alertsToSend, *a)
 					}
-					if a.State == notifier.StateInactive || rw == nil {
-						continue
-					}
-					tss := rule.AlertToTimeSeries(a, execStart)
-					for _, ts := range tss {
-						remoteWriteSent.Inc()
-						if err := rw.Push(ts); err != nil {
-							remoteWriteErrors.Inc()
-							logger.Errorf("failed to push timeseries to remotewrite: %s", err)
-						}
-					}
 				}
-				if len(alertsToSend) > 0 {
-					alertsSent.Add(len(alertsToSend))
-					if err := nr.Send(ctx, alertsToSend); err != nil {
-						alertsSendErrors.Inc()
-						logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
-					}
+				if len(alertsToSend) == 0 {
+					continue
+				}
+				alertsSent.Add(len(alertsToSend))
+				if err := nr.Send(ctx, alertsToSend); err != nil {
+					alertsSendErrors.Inc()
+					logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
 				}
 			}
 			iterationDuration.UpdateDuration(iterationStart)
 		}
 	}
 }
+
+func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
+	if rw == nil {
+		return
+	}
+	tss := rule.AlertToTimeSeries(a, timestamp)
+	remoteWriteSent.Add(len(tss))
+	for _, ts := range tss {
+		if err := rw.Push(ts); err != nil {
+			remoteWriteErrors.Inc()
+			logger.Errorf("failed to push timeseries to remotewrite: %s", err)
+		}
+	}
+}
--- a/app/vmalert/rule.go
+++ b/app/vmalert/rule.go
@ -71,7 +71,7 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
 	}

 	for h, a := range r.alerts {
-		// cleanup inactive alerts from previous Eval
+		// cleanup inactive alerts from previous Exec
 		if a.State == notifier.StateInactive {
 			delete(r.alerts, h)
 		}
@ -109,19 +109,19 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
 		// if alert wasn't updated in this iteration
 		// means it is resolved already
 		if _, ok := updated[h]; !ok {
+			if a.State == notifier.StatePending {
+				// alert was in Pending state - it is not
+				// active anymore
+				delete(r.alerts, h)
+				continue
+			}
 			a.State = notifier.StateInactive
-			// set endTime to last execution time
-			// so it can be sent by notifier on next step
-			a.End = r.lastExecTime
 			continue
 		}
 		if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
 			a.State = notifier.StateFiring
 			alertsFired.Inc()
 		}
-		if a.State == notifier.StateFiring {
-			a.End = r.lastExecTime.Add(3 * *evaluationInterval)
-		}
 	}
 	return nil
 }
@ -293,7 +293,7 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time)

 // Restore restores the state of active alerts basing on previously written timeseries.
 // Restore restores only Start field. Field State will be always Pending and supposed
-// to be updated on next Eval, as well as Value field.
+// to be updated on next Exec, as well as Value field.
 func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
 	if q == nil {
 		return fmt.Errorf("querier is nil")
@ -313,7 +313,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.
 		labels := m.Labels
 		m.Labels = make([]datasource.Label, 0)
 		// drop all extra labels, so hash key will
-		// be identical to timeseries received in Eval
+		// be identical to timeseries received in Exec
 		for _, l := range labels {
 			if l.Name == alertNameLabel {
 				continue
--- a/app/vmalert/rule_test.go
+++ b/app/vmalert/rule_test.go
@ -296,16 +296,14 @@ func TestRule_Exec(t *testing.T) {
 			},
 		},
 		{
-			newTestRule("for-pending=>inactive", time.Millisecond),
+			newTestRule("for-pending=>empty", time.Second),
 			[][]datasource.Metric{
 				{metricWithLabels(t, "name", "foo")},
 				{metricWithLabels(t, "name", "foo")},
-				// empty step to reset pending alerts
+				// empty step to reset and delete pending alerts
 				{},
 			},
-			map[uint64]*notifier.Alert{
-				hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
-			},
+			map[uint64]*notifier.Alert{},
 		},
 		{
 			newTestRule("for-pending=>firing=>inactive", time.Millisecond),