From 9cdd4696fed7d195b5161a86c05a8079ae51d432 Mon Sep 17 00:00:00 2001 From: Roman Khavronenko Date: Wed, 5 May 2021 08:07:19 +0100 Subject: [PATCH] vmalert: add flag to control behaviour on startup for state restore errors (#1265) Alerting rules can now return a dedicated sentinel error, ErrStateRestore, to indicate that the state restore procedure failed. Such errors were returned and logged before as well. But now the user can specify whether to just log these errors (remoteRead.ignoreRestoreErrors=true) or to stop the process (remoteRead.ignoreRestoreErrors=false). The latter is important when VM isn't ready yet to serve queries from vmalert and it needs to wait. https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1252 --- app/vmalert/alerting.go | 5 ++++- app/vmalert/main.go | 1 + app/vmalert/manager.go | 17 +++++++++++++---- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/app/vmalert/alerting.go b/app/vmalert/alerting.go index 75c80f2de..5ecf4da08 100644 --- a/app/vmalert/alerting.go +++ b/app/vmalert/alerting.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "fmt" "hash/fnv" "sort" @@ -404,6 +405,8 @@ func alertForToTimeSeries(name string, a *notifier.Alert, timestamp time.Time) p return newTimeSeries(float64(a.Start.Unix()), labels, timestamp) } +var ErrStateRestore = errors.New("failed to restore the state") + // Restore restores the state of active alerts basing on previously written timeseries. // Restore restores only Start field. Field State will be always Pending and supposed // to be updated on next Exec, as well as Value field. 
@@ -428,7 +431,7 @@ func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookb alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds())) qMetrics, err := q.Query(ctx, expr) if err != nil { - return err + return fmt.Errorf("%s: %w", err, ErrStateRestore) } for _, m := range qMetrics { diff --git a/app/vmalert/main.go b/app/vmalert/main.go index a2a5ff75a..886dcb0b8 100644 --- a/app/vmalert/main.go +++ b/app/vmalert/main.go @@ -47,6 +47,7 @@ eg. 'explore?orgId=1&left=[\"now-1h\",\"now\",\"VictoriaMetrics\",{\"expr\": \"{ remoteReadLookBack = flag.Duration("remoteRead.lookback", time.Hour, "Lookback defines how far to look into past for alerts timeseries."+ " For example, if lookback=1h then range from now() to now()-1h will be scanned.") + remoteReadIgnoreRestoreErrors = flag.Bool("remoteRead.ignoreRestoreErrors", true, "Whether to ignore errors from remote storage when restoring alerts state on startup.") dryRun = flag.Bool("dryRun", false, "Whether to check only config files without running vmalert. The rules file are validated. 
The `-rule` flag must be specified.") ) diff --git a/app/vmalert/manager.go b/app/vmalert/manager.go index cd248f5bd..22de88ae8 100644 --- a/app/vmalert/manager.go +++ b/app/vmalert/manager.go @@ -2,6 +2,7 @@ package main import ( "context" + "errors" "fmt" "strings" "sync" @@ -51,7 +52,12 @@ func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) { } func (m *manager) start(ctx context.Context, path []string, validateTpl, validateExpr bool) error { - return m.update(ctx, path, validateTpl, validateExpr, true) + err := m.update(ctx, path, validateTpl, validateExpr, true) + if *remoteReadIgnoreRestoreErrors && errors.Is(err, ErrStateRestore) { + logger.Errorf("%s", err) + return nil + } + return err } func (m *manager) close() { @@ -64,11 +70,11 @@ func (m *manager) close() { m.wg.Wait() } -func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) { +func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error { if restore && m.rr != nil { err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels) if err != nil { - logger.Errorf("error while restoring state for group %q: %s", group.Name, err) + return fmt.Errorf("error while restoring state for group %q: %w", group.Name, err) } } @@ -79,6 +85,7 @@ func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) { m.wg.Done() }() m.groups[id] = group + return nil } func (m *manager) update(ctx context.Context, path []string, validateTpl, validateExpr, restore bool) error { @@ -117,7 +124,9 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida } } for _, ng := range groupsRegistry { - m.startGroup(ctx, ng, restore) + if err := m.startGroup(ctx, ng, restore); err != nil { + return err + } } m.groupsMu.Unlock()