From 5494bc02a6cb177cb3d9fb7858cbdf47bafe7f79 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Mon, 13 Sep 2021 15:48:18 +0300 Subject: [PATCH] vmalert: add flag to limit the max value for auto-resolve duration for alerts (#1609) * vmalert: add flag to limit the max value for auto-resolve duration for alerts The new flag `rule.maxResolveDuration` is supposed to limit max value for alert.End param, which is used by notifiers like Alertmanager for alerts auto resolve. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1586 --- app/vmalert/group.go | 28 ++++++++++++++++++---------- app/vmalert/group_test.go | 25 +++++++++++++++++++++++++ app/vmalert/main.go | 2 ++ app/vmalert/main_test.go | 11 +++++++++-- app/vmalert/notifier/alert.go | 25 +++++++++++++++++-------- 5 files changed, 71 insertions(+), 20 deletions(-) diff --git a/app/vmalert/group.go b/app/vmalert/group.go index 24b19193b..1e5e8c8ac 100644 --- a/app/vmalert/group.go +++ b/app/vmalert/group.go @@ -277,8 +277,8 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr case <-t.C: g.metrics.iterationTotal.Inc() iterationStart := time.Now() - - errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, g.Interval) + resolveDuration := getResolveDuration(g.Interval) + errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration) for err := range errs { if err != nil { logger.Errorf("group %q: %s", g.Name, err) @@ -290,6 +290,17 @@ func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewr } } +// resolveDuration for alerts is equal to 3 interval evaluations +// so in case if vmalert stops sending updates for some reason, +// notifier could automatically resolve the alert. 
+func getResolveDuration(groupInterval time.Duration) time.Duration { + resolveInterval := groupInterval * 3 + if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) { + return *maxResolveDuration + } + return resolveInterval +} + type executor struct { notifiers []eNotifier rw *remotewrite.Client @@ -301,12 +312,12 @@ type eNotifier struct { alertsSendErrors *counter } -func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, interval time.Duration) chan error { +func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, resolveDuration time.Duration) chan error { res := make(chan error, len(rules)) if concurrency == 1 { // fast path for _, rule := range rules { - res <- e.exec(ctx, rule, interval) + res <- e.exec(ctx, rule, resolveDuration) } close(res) return res @@ -319,7 +330,7 @@ func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurren sem <- struct{}{} wg.Add(1) go func(r Rule) { - res <- e.exec(ctx, r, interval) + res <- e.exec(ctx, r, resolveDuration) <-sem wg.Done() }(rule) @@ -339,7 +350,7 @@ var ( remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`) ) -func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) error { +func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Duration) error { execTotal.Inc() tss, err := rule.Exec(ctx) @@ -365,10 +376,7 @@ func (e *executor) exec(ctx context.Context, rule Rule, interval time.Duration) for _, a := range ar.alerts { switch a.State { case notifier.StateFiring: - // set End to execStart + 3 intervals - // so notifier can resolve it automatically if `vmalert` - // won't be able to send resolve for some reason - a.End = time.Now().Add(3 * interval) + a.End = time.Now().Add(resolveDuration) alerts = append(alerts, *a) case notifier.StateInactive: // set End to execStart to notify diff --git a/app/vmalert/group_test.go b/app/vmalert/group_test.go index 
d717bfea4..629f705d8 100644 --- a/app/vmalert/group_test.go +++ b/app/vmalert/group_test.go @@ -2,6 +2,7 @@ package main import ( "context" + "fmt" "sort" "testing" "time" @@ -235,3 +236,27 @@ func TestGroupStart(t *testing.T) { g.close() <-finished } + +func TestResolveDuration(t *testing.T) { + testCases := []struct { + groupInterval time.Duration + maxDuration time.Duration + expected time.Duration + }{ + {time.Minute, 0, 3 * time.Minute}, + {3 * time.Minute, 0, 9 * time.Minute}, + {time.Minute, 2 * time.Minute, 2 * time.Minute}, + {0, 0, 0}, + } + defaultResolveDuration := *maxResolveDuration + defer func() { *maxResolveDuration = defaultResolveDuration }() + for _, tc := range testCases { + t.Run(fmt.Sprintf("%v-%v-%v", tc.groupInterval, tc.expected, tc.maxDuration), func(t *testing.T) { + *maxResolveDuration = tc.maxDuration + got := getResolveDuration(tc.groupInterval) + if got != tc.expected { + t.Errorf("expected to have %v; got %v", tc.expected, got) + } + }) + } +} diff --git a/app/vmalert/main.go b/app/vmalert/main.go index 27a47b538..fa8ed3753 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -42,6 +42,8 @@ Rule files may contain %{ENV_VAR} placeholders, which are substituted by the cor validateTemplates = flag.Bool("rule.validateTemplates", true, "Whether to validate annotation and label templates") validateExpressions = flag.Bool("rule.validateExpressions", true, "Whether to validate rules expressions via MetricsQL engine") + maxResolveDuration = flag.Duration("rule.maxResolveDuration", 0, "Limits the maximum duration for automatic alert expiration, "+ + "which is by default equal to 3 evaluation intervals of the parent group.") externalURL = flag.String("external.url", "", "External URL is used as alert's source for sent alerts to the notifier") externalAlertSource = flag.String("external.alert.source", "", `External Alert Source allows to override the Source link for alerts sent to AlertManager for cases where you want to build a custom 
link to Grafana, Prometheus or any other service. eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{{$expr|quotesEscape|crlfEscape|queryEscape}}\"},{\"mode\":\"Metrics\"},{\"ui\":[true,true,true,\"none\"]}]'.If empty '/api/v1/:groupID/alertID/status' is used`) diff --git a/app/vmalert/main_test.go b/app/vmalert/main_test.go index a3d454957..8a4174ce3 100644 --- a/app/vmalert/main_test.go +++ b/app/vmalert/main_test.go @@ -94,14 +94,18 @@ groups: *rulesCheckInterval = 200 * time.Millisecond *rulePath = []string{f.Name()} ctx, cancel := context.WithCancel(context.Background()) - defer cancel() m := &manager{ querierBuilder: &fakeQuerier{}, groups: make(map[uint64]*Group), labels: map[string]string{}, } - go configReload(ctx, m, nil) + + syncCh := make(chan struct{}) + go func() { + configReload(ctx, m, nil) + close(syncCh) + }() lenLocked := func(m *manager) int { m.groupsMu.RLock() @@ -138,6 +142,9 @@ groups: if groupsLen != 1 { // should remain unchanged t.Fatalf("expected to have exactly 1 group loaded; got %d", groupsLen) } + + cancel() + <-syncCh } func writeToFile(t *testing.T, file, b string) { diff --git a/app/vmalert/notifier/alert.go b/app/vmalert/notifier/alert.go index 991ef1db3..d345791e5 100644 --- a/app/vmalert/notifier/alert.go +++ b/app/vmalert/notifier/alert.go @@ -14,17 +14,26 @@ import ( // Alert the triggered alert // TODO: Looks like alert name isn't unique type Alert struct { - GroupID uint64 - Name string - Labels map[string]string + // GroupID contains the ID of the parent rules group + GroupID uint64 + // Name represents Alert name + Name string + // Labels is the list of label-value pairs attached to the Alert + Labels map[string]string + // Annotations is the list of annotations generated on Alert evaluation Annotations map[string]string - State AlertState - - Expr string + // State represents the current state of the Alert + State AlertState + // Expr contains expression that was executed to generate the 
Alert + Expr string + // Start defines the moment of time when Alert has triggered Start time.Time - End time.Time + // End defines the moment of time when Alert is supposed to expire + End time.Time + // Value stores the value returned from evaluating expression from Expr field Value float64 - ID uint64 + // ID is the unique identifier for the Alert + ID uint64 } // AlertState type indicates the Alert state