vmalert: avoid sending resolves for pending alerts (#498)

Before the change we were sending notifications to notifier
if following conditions are met:
* alert is in Fire state
* alert is in Inactive state

We were sending Inactive notifications to resolve alert ASAP. 
Unfortunately, we were sending resolves for Pending alerts that become
Inactive, which is wrong.

In this change we delete alert from the active list if
it was Pending and become Inactive. In this way we now
have Inactive alerts only if they were in state Fire before.
See test change for example.
This commit is contained in:
Roman Khavronenko 2020-05-17 15:13:22 +01:00 committed by Aliaksandr Valialkin
parent e5f5342e18
commit c7f3e58032
3 changed files with 48 additions and 33 deletions

View File

@ -69,7 +69,7 @@ func (g *Group) updateWith(newGroup Group) {
// copy all significant fields.
// alerts state isn't copied since
// it should be updated in next 2 Evals
// it should be updated in next 2 Execs
or.For = nr.For
or.Expr = nr.Expr
or.Labels = nr.Labels
@ -151,30 +151,47 @@ func (g *Group) start(ctx context.Context, interval time.Duration,
var alertsToSend []notifier.Alert
for _, a := range rule.alerts {
if a.State != notifier.StatePending {
switch a.State {
case notifier.StateFiring:
// set End to execStart + 3 intervals
// so notifier can resolve it automatically if `vmalert`
// won't be able to send resolve for some reason
a.End = execStart.Add(3 * interval)
alertsToSend = append(alertsToSend, *a)
pushToRW(rw, rule, a, execStart)
case notifier.StatePending:
pushToRW(rw, rule, a, execStart)
case notifier.StateInactive:
// set End to execStart to notify
// that it was just resolved
a.End = execStart
alertsToSend = append(alertsToSend, *a)
}
if a.State == notifier.StateInactive || rw == nil {
}
if len(alertsToSend) == 0 {
continue
}
tss := rule.AlertToTimeSeries(a, execStart)
for _, ts := range tss {
remoteWriteSent.Inc()
if err := rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
}
}
}
if len(alertsToSend) > 0 {
alertsSent.Add(len(alertsToSend))
if err := nr.Send(ctx, alertsToSend); err != nil {
alertsSendErrors.Inc()
logger.Errorf("failed to send alert for rule %q.%q: %s", g.Name, rule.Name, err)
}
}
}
iterationDuration.UpdateDuration(iterationStart)
}
}
}
func pushToRW(rw *remotewrite.Client, rule *Rule, a *notifier.Alert, timestamp time.Time) {
if rw == nil {
return
}
tss := rule.AlertToTimeSeries(a, timestamp)
remoteWriteSent.Add(len(tss))
for _, ts := range tss {
if err := rw.Push(ts); err != nil {
remoteWriteErrors.Inc()
logger.Errorf("failed to push timeseries to remotewrite: %s", err)
}
}
}

View File

@ -71,7 +71,7 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
}
for h, a := range r.alerts {
// cleanup inactive alerts from previous Eval
// cleanup inactive alerts from previous Exec
if a.State == notifier.StateInactive {
delete(r.alerts, h)
}
@ -109,19 +109,19 @@ func (r *Rule) Exec(ctx context.Context, q datasource.Querier) error {
// if alert wasn't updated in this iteration
// means it is resolved already
if _, ok := updated[h]; !ok {
if a.State == notifier.StatePending {
// alert was in Pending state - it is not
// active anymore
delete(r.alerts, h)
continue
}
a.State = notifier.StateInactive
// set endTime to last execution time
// so it can be sent by notifier on next step
a.End = r.lastExecTime
continue
}
if a.State == notifier.StatePending && time.Since(a.Start) >= r.For {
a.State = notifier.StateFiring
alertsFired.Inc()
}
if a.State == notifier.StateFiring {
a.End = r.lastExecTime.Add(3 * *evaluationInterval)
}
}
return nil
}
@ -293,7 +293,7 @@ func newTimeSeries(value float64, labels map[string]string, timestamp time.Time)
// Restore restores the state of active alerts basing on previously written timeseries.
// Restore restores only Start field. Field State will be always Pending and supposed
// to be updated on next Eval, as well as Value field.
// to be updated on next Exec, as well as Value field.
func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration) error {
if q == nil {
return fmt.Errorf("querier is nil")
@ -313,7 +313,7 @@ func (r *Rule) Restore(ctx context.Context, q datasource.Querier, lookback time.
labels := m.Labels
m.Labels = make([]datasource.Label, 0)
// drop all extra labels, so hash key will
// be identical to timeseries received in Eval
// be identical to timeseries received in Exec
for _, l := range labels {
if l.Name == alertNameLabel {
continue

View File

@ -296,16 +296,14 @@ func TestRule_Exec(t *testing.T) {
},
},
{
newTestRule("for-pending=>inactive", time.Millisecond),
newTestRule("for-pending=>empty", time.Second),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to reset pending alerts
// empty step to reset and delete pending alerts
{},
},
map[uint64]*notifier.Alert{
hash(metricWithLabels(t, "name", "foo")): {State: notifier.StateInactive},
},
map[uint64]*notifier.Alert{},
},
{
newTestRule("for-pending=>firing=>inactive", time.Millisecond),