vmalert: fix alert firing state in replay mode (#5192)

fix possibly missing firing states for alerting rules in replay mode
Previously, if a firing stage was longer than a single query request range (for example, a rule with a big `for` duration), the alerting rule could not be detected as firing.

Co-authored-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit abcb21aa5e)
Hui Wang 2023-10-30 20:54:18 +08:00 committed by hagen1778
parent f0d10e2004
commit 8a786e5df4
3 changed files with 112 additions and 12 deletions
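To illustrate the failure mode described in the commit message, here is a minimal, self-contained Go sketch (not vmalert code; `evalWindow`, the window layout and the durations are invented for illustration): replay evaluates a rule over consecutive query windows, and when every window starts from a fresh alert state, a `for` duration longer than a single window can never elapse, so the alert never reaches firing.

```go
// Sketch only: demonstrates why per-window evaluation with fresh state
// cannot fire an alert whose `for` exceeds the window width, and why
// carrying the activation time between windows fixes it.
package main

import (
	"fmt"
	"time"
)

// evalWindow walks the datapoint timestamps of one replay window and reports
// whether the alert reaches firing, given the moment it became active.
func evalWindow(points []time.Time, forDur time.Duration, activeAt time.Time) (string, time.Time) {
	state := "pending"
	for _, at := range points {
		if activeAt.IsZero() {
			activeAt = at // alert becomes active at the first matching point
		}
		if at.Sub(activeAt) >= forDur {
			state = "firing"
		}
	}
	return state, activeAt
}

func main() {
	forDur := 2 * time.Hour // rule's `for` exceeds the 1h replay window
	base := time.Unix(0, 0).UTC()
	windows := [][]time.Time{
		{base, base.Add(30 * time.Minute)},                     // 0h .. 1h
		{base.Add(1 * time.Hour), base.Add(90 * time.Minute)},  // 1h .. 2h
		{base.Add(2 * time.Hour), base.Add(150 * time.Minute)}, // 2h .. 3h
	}

	// Old behavior: every window starts from scratch -> alert stays pending.
	for i, w := range windows {
		state, _ := evalWindow(w, forDur, time.Time{})
		fmt.Printf("fresh state,   window %d: %s\n", i+1, state)
	}

	// Fixed behavior: ActiveAt is carried between windows -> fires in window 3.
	var activeAt time.Time
	var state string
	for i, w := range windows {
		state, activeAt = evalWindow(w, forDur, activeAt)
		fmt.Printf("carried state, window %d: %s\n", i+1, state)
	}
}
```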


@@ -295,24 +295,33 @@ func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*l
 }
 
 // execRange executes alerting rule on the given time range similarly to exec.
-// It doesn't update internal states of the Rule and meant to be used just
-// to get time series for backfilling.
-// It returns ALERT and ALERT_FOR_STATE time series as result.
+// When making consecutive calls make sure to respect time linearity for start and end params,
+// as this function modifies AlertingRule alerts state.
+// It is not thread safe.
+// It returns ALERT and ALERT_FOR_STATE time series as a result.
 func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
     res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
     if err != nil {
         return nil, err
     }
     var result []prompbmarshal.TimeSeries
+    holdAlertState := make(map[uint64]*notifier.Alert)
     qFn := func(query string) ([]datasource.Metric, error) {
         return nil, fmt.Errorf("`query` template isn't supported in replay mode")
     }
     for _, s := range res.Data {
+        ls, err := ar.toLabels(s, qFn)
+        if err != nil {
+            return nil, fmt.Errorf("failed to expand labels: %s", err)
+        }
+        h := hash(ls.processed)
         a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
         if err != nil {
             return nil, fmt.Errorf("failed to create alert: %w", err)
         }
-        if ar.For == 0 { // if alert is instant
+        // if alert is instant, For: 0
+        if ar.For == 0 {
             a.State = notifier.StateFiring
             for i := range s.Values {
                 result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
@@ -324,18 +333,32 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
         prevT := time.Time{}
         for i := range s.Values {
             at := time.Unix(s.Timestamps[i], 0)
+            // try to restore alert's state on the first iteration
+            if at.Equal(start) {
+                if _, ok := ar.alerts[h]; ok {
+                    a = ar.alerts[h]
+                    prevT = at
+                }
+            }
             if at.Sub(prevT) > ar.EvalInterval {
                 // reset to Pending if there are gaps > EvalInterval between DPs
                 a.State = notifier.StatePending
                 a.ActiveAt = at
+                a.Start = time.Time{}
-            } else if at.Sub(a.ActiveAt) >= ar.For {
+            } else if at.Sub(a.ActiveAt) >= ar.For && a.State != notifier.StateFiring {
                 a.State = notifier.StateFiring
                 a.Start = at
             }
             prevT = at
             result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
+            // save alert's state on last iteration, so it can be used on the next execRange call
+            if at.Equal(end) {
+                holdAlertState[h] = a
+            }
         }
     }
+    ar.alerts = holdAlertState
     return result, nil
 }
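The essence of the change above is that per-series alert state is now keyed by a hash of the series labels, saved at the end of one execRange call and restored at the start of the next one (when `at.Equal(start)`). Below is a rough standalone sketch of that hold/restore pattern; `labelsHash`, `alertState` and `evalRange` are made-up names, not vmalert's API, and stand in for `hash(ls.processed)`, `notifier.Alert` and `execRange`.

```go
// Sketch only: per-series alert state is kept in a map keyed by a hash of the
// series labels, saved at the end of one range evaluation and restored at the
// start of the next, so consecutive calls see a continuous state.
package main

import (
	"fmt"
	"hash/fnv"
	"sort"
)

type alertState struct {
	State    string
	ActiveAt int64 // unix seconds
}

// labelsHash builds a stable key for a label set.
func labelsHash(labels map[string]string) uint64 {
	keys := make([]string, 0, len(labels))
	for k := range labels {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	h := fnv.New64a()
	for _, k := range keys {
		h.Write([]byte(k))
		h.Write([]byte{0xff})
		h.Write([]byte(labels[k]))
		h.Write([]byte{0xff})
	}
	return h.Sum64()
}

// evalRange evaluates one replay window for a single series and returns the
// state map to hold for the next window.
func evalRange(prev map[uint64]*alertState, labels map[string]string, timestamps []int64, forSec int64) map[uint64]*alertState {
	hold := make(map[uint64]*alertState)
	h := labelsHash(labels)
	st := &alertState{State: "pending", ActiveAt: timestamps[0]}
	if restored, ok := prev[h]; ok { // restore state from the previous call
		st = restored
	}
	for _, ts := range timestamps {
		if ts-st.ActiveAt >= forSec && st.State != "firing" {
			st.State = "firing"
		}
	}
	hold[h] = st // save for the next call
	return hold
}

func main() {
	labels := map[string]string{"alertname": "demo"}
	// `for` is 120s, but each window covers only 60s of datapoints.
	state := evalRange(nil, labels, []int64{0, 30, 60}, 120)
	fmt.Println("after window 1:", state[labelsHash(labels)].State) // pending
	state = evalRange(state, labels, []int64{90, 120, 150}, 120)
	fmt.Println("after window 2:", state[labelsHash(labels)].State) // firing
}
```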


@@ -346,15 +346,18 @@ func TestAlertingRule_Exec(t *testing.T) {
 }
 
 func TestAlertingRule_ExecRange(t *testing.T) {
+    fakeGroup := Group{Name: "TestRule_ExecRange"}
     testCases := []struct {
         rule      *AlertingRule
         data      []datasource.Metric
         expAlerts []*notifier.Alert
+        expHoldAlertStateAlerts map[uint64]*notifier.Alert
     }{
         {
             newTestAlertingRule("empty", 0),
             []datasource.Metric{},
             nil,
+            nil,
         },
         {
             newTestAlertingRule("empty labels", 0),
@@ -364,6 +367,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
             []*notifier.Alert{
                 {State: notifier.StateFiring},
             },
+            nil,
         },
         {
             newTestAlertingRule("single-firing", 0),
@@ -376,6 +380,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                     State: notifier.StateFiring,
                 },
             },
+            nil,
         },
         {
             newTestAlertingRule("single-firing-on-range", 0),
@@ -387,6 +392,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                 {State: notifier.StateFiring},
                 {State: notifier.StateFiring},
             },
+            nil,
         },
         {
             newTestAlertingRule("for-pending", time.Second),
@@ -398,6 +404,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                 {State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
                 {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
             },
+            map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): {
+                GroupID:     fakeGroup.ID(),
+                Name:        "for-pending",
+                Labels:      map[string]string{"alertname": "for-pending"},
+                Annotations: map[string]string{},
+                State:       notifier.StatePending,
+                ActiveAt:    time.Unix(5, 0),
+                Value:       1,
+                For:         time.Second,
+            }},
         },
         {
             newTestAlertingRule("for-firing", 3*time.Second),
@@ -409,6 +425,38 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                 {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
                 {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
             },
+            map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): {
+                GroupID:     fakeGroup.ID(),
+                Name:        "for-firing",
+                Labels:      map[string]string{"alertname": "for-firing"},
+                Annotations: map[string]string{},
+                State:       notifier.StateFiring,
+                ActiveAt:    time.Unix(1, 0),
+                Start:       time.Unix(5, 0),
+                Value:       1,
+                For:         3 * time.Second,
+            }},
+        },
+        {
+            newTestAlertingRule("for-hold-pending", time.Second),
+            []datasource.Metric{
+                {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}},
+            },
+            []*notifier.Alert{
+                {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
+                {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
+                {State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
+            },
+            map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): {
+                GroupID:     fakeGroup.ID(),
+                Name:        "for-hold-pending",
+                Labels:      map[string]string{"alertname": "for-hold-pending"},
+                Annotations: map[string]string{},
+                State:       notifier.StatePending,
+                ActiveAt:    time.Unix(5, 0),
+                Value:       1,
+                For:         time.Second,
+            }},
         },
         {
             newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
@@ -422,9 +470,10 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                 {State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
                 {State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
             },
+            nil,
         },
         {
-            newTestAlertingRule("multi-series-for=>pending=>pending=>firing", 3*time.Second),
+            newTestAlertingRule("multi-series", 3*time.Second),
             []datasource.Metric{
                 {Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
                 {
@@ -436,7 +485,6 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                 {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
                 {State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
                 {State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
-                //
                 {
                     State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
                     Labels: map[string]string{
@@ -450,6 +498,29 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                     },
                 },
             },
+            map[uint64]*notifier.Alert{
+                hash(map[string]string{"alertname": "multi-series"}): {
+                    GroupID:     fakeGroup.ID(),
+                    Name:        "multi-series",
+                    Labels:      map[string]string{"alertname": "multi-series"},
+                    Annotations: map[string]string{},
+                    State:       notifier.StateFiring,
+                    ActiveAt:    time.Unix(1, 0),
+                    Start:       time.Unix(5, 0),
+                    Value:       1,
+                    For:         3 * time.Second,
+                },
+                hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): {
+                    GroupID:     fakeGroup.ID(),
+                    Name:        "multi-series",
+                    Labels:      map[string]string{"alertname": "multi-series", "foo": "bar"},
+                    Annotations: map[string]string{},
+                    State:       notifier.StatePending,
+                    ActiveAt:    time.Unix(5, 0),
+                    Value:       1,
+                    For:         3 * time.Second,
+                },
+            },
         },
         {
             newTestRuleWithLabels("multi-series-firing", "source", "vm"),
@@ -477,16 +548,16 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                     "source": "vm",
                 }},
             },
+            nil,
         },
     }
-    fakeGroup := Group{Name: "TestRule_ExecRange"}
     for _, tc := range testCases {
         t.Run(tc.rule.Name, func(t *testing.T) {
             fq := &datasource.FakeQuerier{}
             tc.rule.q = fq
             tc.rule.GroupID = fakeGroup.ID()
             fq.Add(tc.data...)
-            gotTS, err := tc.rule.execRange(context.TODO(), time.Now(), time.Now())
+            gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0))
             if err != nil {
                 t.Fatalf("unexpected err: %s", err)
             }
@@ -512,6 +583,11 @@ func TestAlertingRule_ExecRange(t *testing.T) {
                     t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
                 }
             }
+            if tc.expHoldAlertStateAlerts != nil {
+                if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) {
+                    t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts)
+                }
+            }
         })
     }
 }


@@ -59,6 +59,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): strip sensitive information such as auth headers or passwords from datasource, remote-read, remote-write or notifier URLs in log messages or UI. This behavior is by default and is controlled via `-datasource.showURL`, `-remoteRead.showURL`, `remoteWrite.showURL` or `-notifier.showURL` cmd-line flags. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5044).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix vmalert web UI when running on 32-bit architectures machine.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): do not send requests to configured remote systems when `-datasource.*`, `-remoteWrite.*`, `-remoteRead.*` or `-notifier.*` command-line flags refer files with invalid auth configs. Previously such requests were sent without properly set auth headers. Now the requests are sent only after the files are updated with valid auth configs. See [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5153).
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): properly maintain alerts state in [replay mode](https://docs.victoriametrics.com/vmalert.html#rules-backfilling) if alert's `for` param was bigger than replay request range (usually a couple of hours). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5186) for details.
 * BUGFIX: `vmselect`: improve performance and memory usage during query processing on machines with big number of CPU cores. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5087).
 * BUGFIX: dashboards: fix vminsert/vmstorage/vmselect metrics filtering when dashboard is used to display data from many sub-clusters with unique job names. Before, only one specific job could have been accounted for component-specific panels, instead of all available jobs for the component.
 * BUGFIX: dashboards/vmalert: apply `desc` sorting in tooltips for vmalert dashboard in order to improve visibility of the outliers on graph.