From 11ae1ae92432731989bfe8af1bd7e4e7384719fc Mon Sep 17 00:00:00 2001 From: Dmytro Kozlov Date: Wed, 16 Mar 2022 17:26:33 +0200 Subject: [PATCH] Added resendDelay for alerts (#2296) * vmalert: add support of `resendDelay` flag for alerts Co-authored-by: dmitryk-dk Co-authored-by: hagen1778 --- app/vmalert/README.md | 2 + app/vmalert/alerting.go | 23 ++++++ app/vmalert/alerting_test.go | 71 +++++++++++++++++++ .../config/testdata/rules_interval_good.rules | 12 ++++ app/vmalert/group.go | 35 ++++----- app/vmalert/group_test.go | 30 ++++++-- app/vmalert/helpers_test.go | 9 +++ app/vmalert/main.go | 2 + app/vmalert/notifier/alert.go | 2 + 9 files changed, 158 insertions(+), 28 deletions(-) create mode 100644 app/vmalert/config/testdata/rules_interval_good.rules diff --git a/app/vmalert/README.md b/app/vmalert/README.md index 8b1b5f3a6..8e0aaf5f9 100644 --- a/app/vmalert/README.md +++ b/app/vmalert/README.md @@ -774,6 +774,8 @@ The shortlist of configuration flags is the following: Interval for checking for changes in '-rule' files. By default the checking is disabled. Send SIGHUP signal in order to force config check for changes. DEPRECATED - see '-configCheckInterval' instead -rule.maxResolveDuration duration Limits the maximum duration for automatic alert expiration, which is by default equal to 3 evaluation intervals of the parent group. + -rule.resendDelay duration + Minimum amount of time to wait before resending an alert to notifier -rule.validateExpressions Whether to validate rules expressions via MetricsQL engine (default true) -rule.validateTemplates diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 34d59ea19..1d833b760 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -550,3 +550,26 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb } return nil } + +// alertsToSend walks through the current alerts of AlertingRule +// and returns only those which should be sent to notifier. +// Isn't concurrent safe. 
+func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert { + var alerts []notifier.Alert + for _, a := range ar.alerts { + switch a.State { + case notifier.StateFiring: + if time.Since(a.LastSent) < resendDelay { + continue + } + a.End = ts.Add(resolveDuration) + a.LastSent = ts + alerts = append(alerts, *a) + case notifier.StateInactive: + a.End = ts + a.LastSent = ts + alerts = append(alerts, *a) + } + } + return alerts +} diff --git a/app/vmalert/alerting_test.go b/app/vmalert/alerting_test.go index c72daa213..184e13922 100644 --- a/app/vmalert/alerting_test.go +++ b/app/vmalert/alerting_test.go @@ -4,6 +4,7 @@ import ( "context" "errors" "reflect" + "sort" "strings" "testing" "time" @@ -781,6 +782,76 @@ func TestAlertingRule_Template(t *testing.T) { } } +func TestAlertsToSend(t *testing.T) { + ts := time.Now() + f := func(alerts, expAlerts []*notifier.Alert, resolveDuration, resendDelay time.Duration) { + t.Helper() + ar := &AlertingRule{alerts: make(map[uint64]*notifier.Alert)} + for i, a := range alerts { + ar.alerts[uint64(i)] = a + } + gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay) + if gotAlerts == nil && expAlerts == nil { + return + } + if len(gotAlerts) != len(expAlerts) { + t.Fatalf("expected to get %d alerts; got %d instead", + len(expAlerts), len(gotAlerts)) + } + sort.Slice(expAlerts, func(i, j int) bool { + return expAlerts[i].Name < expAlerts[j].Name + }) + sort.Slice(gotAlerts, func(i, j int) bool { + return gotAlerts[i].Name < gotAlerts[j].Name + }) + for i, exp := range expAlerts { + got := gotAlerts[i] + if got.LastSent != exp.LastSent { + t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent) + } + if got.End != exp.End { + t.Fatalf("expected End to be %v; got %v", exp.End, got.End) + } + } + } + + f( // send firing alert with custom resolve time + []*notifier.Alert{{State: notifier.StateFiring}}, + []*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}}, + 5*time.Minute, time.Minute, + ) + f( // resolve inactive alert at the current timestamp + []*notifier.Alert{{State: notifier.StateInactive}}, + []*notifier.Alert{{LastSent: ts, End: ts}}, + time.Minute, time.Minute, + ) + f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting + []*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive}}, + []*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}}, + 5*time.Minute, time.Minute, + ) + f( // mixed case of pending and resolved alerts. 
Names are added for deterministic sorting + []*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive}}, + []*notifier.Alert{{Name: "b", LastSent: ts, End: ts}}, + 5*time.Minute, time.Minute, + ) + f( // attempt to send alert that was already sent in the resendDelay interval + []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}}, + nil, + time.Minute, time.Minute, + ) + f( // attempt to send alert that was sent out of the resendDelay interval + []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}}, + []*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}}, + time.Minute, time.Minute, + ) + f( // alert must be sent even if resendDelay interval is 0 + []*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}}, + []*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}}, + time.Minute, 0, + ) +} + func newTestRuleWithLabels(name string, labels ...string) *AlertingRule { r := newTestAlertingRule(name, 0) r.Labels = make(map[string]string) diff --git a/app/vmalert/config/testdata/rules_interval_good.rules b/app/vmalert/config/testdata/rules_interval_good.rules new file mode 100644 index 000000000..0bbb7d21e --- /dev/null +++ b/app/vmalert/config/testdata/rules_interval_good.rules @@ -0,0 +1,12 @@ +groups: + - name: groupTest + interval: 1s + rules: + - alert: VMRows + for: 2s + expr: sum(rate(vm_http_request_errors_total[2s])) > 0 + labels: + label: bar + host: "{{ $labels.instance }}" + annotations: + summary: "{{ $value }}" diff --git a/app/vmalert/group.go b/app/vmalert/group.go index bedf1de67..cc5bf3a37 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -277,8 +277,7 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r g.metrics.iterationTotal.Inc() iterationStart := time.Now() if len(g.Rules) > 0 { - resolveDuration := getResolveDuration(g.Interval) - errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration) + errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, getResolveDuration(g.Interval)) for err := range errs { if err != nil { logger.Errorf("group %q: %s", g.Name, err) @@ -291,15 +290,18 @@ func (g *Group) start(ctx context.Context, nts func() []notifier.Notifier, rw *r } } -// resolveDuration for alerts is equal to 3 interval evaluations -// so in case if vmalert stops sending updates for some reason, -// notifier could automatically resolve the alert. +// getResolveDuration returns the duration after which firing alert +// can be considered as resolved. 
func getResolveDuration(groupInterval time.Duration) time.Duration { - resolveInterval := groupInterval * 3 - if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) { - return *maxResolveDuration + delta := *resendDelay + if groupInterval > delta { + delta = groupInterval } - return resolveInterval + resolveDuration := delta * 4 + if *maxResolveDuration > 0 && resolveDuration > *maxResolveDuration { + resolveDuration = *maxResolveDuration + } + return resolveDuration } type executor struct { @@ -370,19 +372,8 @@ func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Dur if !ok { return nil } - var alerts []notifier.Alert - for _, a := range ar.alerts { - switch a.State { - case notifier.StateFiring: - a.End = now.Add(resolveDuration) - alerts = append(alerts, *a) - case notifier.StateInactive: - // set End to execStart to notify - // that it was just resolved - a.End = now - alerts = append(alerts, *a) - } - } + + alerts := ar.alertsToSend(now, resolveDuration, *resendDelay) if len(alerts) < 1 { return nil } diff --git a/app/vmalert/group_test.go b/app/vmalert/group_test.go index 6aa15dd77..d0906a485 100644 --- a/app/vmalert/group_test.go +++ b/app/vmalert/group_test.go @@ -158,10 +158,11 @@ func TestGroupStart(t *testing.T) { if err != nil { t.Fatalf("failed to parse rules: %s", err) } - const evalInterval = time.Millisecond + fs := &fakeQuerier{} fn := &fakeNotifier{} + const evalInterval = time.Millisecond g := newGroup(groups[0], fs, evalInterval, map[string]string{"cluster": "east-1"}) g.Concurrency = 2 @@ -223,6 +224,12 @@ func TestGroupStart(t *testing.T) { expectedAlerts := []notifier.Alert{*alert1, *alert2} compareAlerts(t, expectedAlerts, gotAlerts) + gotAlertsNum := fn.getCounter() + if gotAlertsNum < len(expectedAlerts)*2 { + t.Fatalf("expected to receive at least %d alerts; got %d instead", + len(expectedAlerts)*2, gotAlertsNum) + } + // reset previous data fs.reset() // and set only one datapoint for response @@ -243,18 +250,29 @@ func TestResolveDuration(t *testing.T) { testCases := []struct { groupInterval time.Duration maxDuration time.Duration + resendDelay time.Duration expected time.Duration }{ - {time.Minute, 0, 3 * time.Minute}, - {3 * time.Minute, 0, 9 * time.Minute}, - {time.Minute, 2 * time.Minute, 2 * time.Minute}, - {0, 0, 0}, + {time.Minute, 0, 0, 4 * time.Minute}, + {time.Minute, 0, 2 * time.Minute, 8 * time.Minute}, + {time.Minute, 4 * time.Minute, 4 * time.Minute, 4 * time.Minute}, + {2 * time.Minute, time.Minute, 2 * time.Minute, time.Minute}, + {time.Minute, 2 * time.Minute, 1 * time.Minute, 2 * time.Minute}, + {2 * time.Minute, 0, 1 * time.Minute, 8 * time.Minute}, + {0, 0, 0, 0}, } + defaultResolveDuration := *maxResolveDuration - defer func() { *maxResolveDuration = defaultResolveDuration }() + defaultResendDelay := *resendDelay + defer func() { + *maxResolveDuration = defaultResolveDuration + *resendDelay = defaultResendDelay + }() + for _, tc := range testCases { t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) { *maxResolveDuration = tc.maxDuration + *resendDelay = tc.resendDelay got := getResolveDuration(tc.groupInterval) if got != tc.expected { t.Errorf("expected to have %v; got %v", tc.expected, got) diff --git a/app/vmalert/helpers_test.go b/app/vmalert/helpers_test.go index 5fac5b1e0..ebcdca3c3 100644 --- a/app/vmalert/helpers_test.go +++ b/app/vmalert/helpers_test.go @@ -61,6 +61,8 @@ func (fq *fakeQuerier) Query(_ context.Context, _ string) 
([]datasource.Metric, type fakeNotifier struct { sync.Mutex alerts []notifier.Alert + // records number of received alerts in total + counter int } func (*fakeNotifier) Close() {} @@ -68,10 +70,17 @@ func (*fakeNotifier) Addr() string { return "" } func (fn *fakeNotifier) Send(_ context.Context, alerts []notifier.Alert) error { fn.Lock() defer fn.Unlock() + fn.counter += len(alerts) fn.alerts = alerts return nil } +func (fn *fakeNotifier) getCounter() int { + fn.Lock() + defer fn.Unlock() + return fn.counter +} + func (fn *fakeNotifier) getAlerts() []notifier.Alert { fn.Lock() defer fn.Unlock() diff --git a/app/vmalert/main.go b/app/vmalert/main.go index 0d23d56b6..c98650329 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -47,6 +47,8 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine") maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+ "which is by default equal to 3 evaluation intervals of the parent group.") + resendDelay = flag.Duration("rule.resendDelay", 0, "Minimum amount of time to wait before resending an alert to notifier") + externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier") externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom link to Grafana, Prometheus or any other service. eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`) diff --git a/app/vmalert/notifier/alert.go b/app/vmalert/notifier/alert.go index 4aa661d24..121babab1 100644 --- a/app/vmalert/notifier/alert.go +++ b/app/vmalert/notifier/alert.go @@ -30,6 +30,8 @@ type Alert struct { Start time.Time // End defines the moment of time when Alert supposed to expire End time.Time + // LastSent defines the moment when Alert was sent last time + LastSent time.Time // Value stores the value returned from evaluating expression from Expr field Value float64 // ID is the unique identifer for the Alert
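
Note on the reworked resolve window: the new getResolveDuration in group.go derives the auto-resolve duration as 4 times the larger of the group's evaluation interval and -rule.resendDelay, capped by -rule.maxResolveDuration when that flag is non-zero. Below is a minimal standalone sketch of that calculation; the helper takes the flag values as explicit parameters instead of reading globals, and all names outside the patch are illustrative only.

package main

import (
	"fmt"
	"time"
)

// resolveDuration mirrors the logic added in group.go: take the larger of the
// group interval and the resend delay, multiply by 4, and cap the result by
// maxResolveDuration when it is non-zero.
func resolveDuration(groupInterval, resendDelay, maxResolveDuration time.Duration) time.Duration {
	delta := resendDelay
	if groupInterval > delta {
		delta = groupInterval
	}
	d := delta * 4
	if maxResolveDuration > 0 && d > maxResolveDuration {
		d = maxResolveDuration
	}
	return d
}

func main() {
	// These match expectations from TestResolveDuration above.
	fmt.Println(resolveDuration(time.Minute, 0, 0))                       // 4m0s
	fmt.Println(resolveDuration(time.Minute, 2*time.Minute, 0))           // 8m0s
	fmt.Println(resolveDuration(time.Minute, time.Minute, 2*time.Minute)) // 2m0s
}

The multiplier of 4 applied to max(interval, resendDelay), rather than the previous fixed 3x group interval, presumably keeps the expiration window comfortably longer than the gap between resends, so an alert that is still firing but throttled by resendDelay is not auto-resolved by the notifier between updates.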
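
Note on the resend gating itself: alertsToSend re-sends a firing alert only when at least resendDelay has passed since its LastSent timestamp, always sends an inactive (resolved) alert once more with End set to the current timestamp so the notifier resolves it, and never sends pending alerts. The following self-contained sketch restates that rule; the alert type here is a simplified stand-in for notifier.Alert, not part of the patch.

package main

import (
	"fmt"
	"time"
)

type state int

const (
	statePending state = iota
	stateFiring
	stateInactive
)

// alert is a simplified stand-in for notifier.Alert, carrying only the fields
// the resend logic cares about.
type alert struct {
	state    state
	lastSent time.Time
	end      time.Time
}

// shouldSend applies the same gating as AlertingRule.alertsToSend: firing alerts
// are throttled by resendDelay, inactive alerts are flushed immediately so the
// notifier can resolve them, and pending alerts are never sent.
func shouldSend(a *alert, now time.Time, resolveDuration, resendDelay time.Duration) bool {
	switch a.state {
	case stateFiring:
		if now.Sub(a.lastSent) < resendDelay {
			return false
		}
		a.end = now.Add(resolveDuration)
		a.lastSent = now
		return true
	case stateInactive:
		a.end = now
		a.lastSent = now
		return true
	}
	return false
}

func main() {
	now := time.Now()
	recent := &alert{state: stateFiring, lastSent: now.Add(-10 * time.Second)}
	stale := &alert{state: stateFiring, lastSent: now.Add(-2 * time.Minute)}
	fmt.Println(shouldSend(recent, now, 4*time.Minute, time.Minute)) // false: still inside the resend delay
	fmt.Println(shouldSend(stale, now, 4*time.Minute, time.Minute))  // true: resend delay has elapsed
}

With the default -rule.resendDelay=0 the check time.Since(LastSent) < resendDelay never triggers, so firing alerts are still pushed on every evaluation as before; setting, for example, -rule.resendDelay=5m keeps at least five minutes between repeated notifications for an alert that stays firing.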