VictoriaMetrics/app/vmalert/rule/alerting_test.go
Roman Khavronenko a3e198588f
vmalert: set ActiveAt to evaluation timestamp in newAlert fn ()
The change fixes flaky test `TestAlertingRule_Exec` which has dependency on the actual timestamps,
which resulted into inaccurate test states:
https://github.com/VictoriaMetrics/VictoriaMetrics/actions/runs/7608452967/job/20717699688

Signed-off-by: hagen1778 <roman@victoriametrics.com>
2024-01-29 17:30:14 +01:00

1202 lines
40 KiB
Go

package rule
import (
"context"
"errors"
"fmt"
"reflect"
"sort"
"strings"
"sync"
"testing"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
func TestAlertingRule_ToTimeSeries(t *testing.T) {
timestamp := time.Now()
testCases := []struct {
rule *AlertingRule
alert *notifier.Alert
expTS []prompbmarshal.TimeSeries
}{
{
newTestAlertingRule("instant", 0),
&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
}),
},
},
{
newTestAlertingRule("instant extra labels", 0),
&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
Labels: map[string]string{
"job": "foo",
"instance": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
"job": "foo",
"instance": "bar",
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
"job": "foo",
"instance": "bar",
}),
},
},
{
newTestAlertingRule("instant labels override", 0),
&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
Labels: map[string]string{
alertStateLabel: "foo",
"__name__": "bar",
}},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
alertStateLabel: "foo",
}),
},
},
{
newTestAlertingRule("for", time.Second),
&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StateFiring.String(),
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
}),
},
},
{
newTestAlertingRule("for pending", 10*time.Second),
&notifier.Alert{State: notifier.StatePending, ActiveAt: timestamp.Add(time.Second)},
[]prompbmarshal.TimeSeries{
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
"__name__": alertMetricName,
alertStateLabel: notifier.StatePending.String(),
}),
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
[]int64{timestamp.UnixNano()},
map[string]string{
"__name__": alertForStateMetricName,
}),
},
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
tc.rule.alerts[tc.alert.ID] = tc.alert
tss := tc.rule.toTimeSeries(timestamp.Unix())
if err := compareTimeSeries(t, tc.expTS, tss); err != nil {
t.Fatalf("timeseries missmatch: %s", err)
}
})
}
}
func TestAlertingRule_Exec(t *testing.T) {
const defaultStep = 5 * time.Millisecond
type testAlert struct {
labels []string
alert *notifier.Alert
}
testCases := []struct {
rule *AlertingRule
steps [][]datasource.Metric
expAlerts map[int][]testAlert
}{
{
newTestAlertingRule("empty", 0),
[][]datasource.Metric{},
nil,
},
{
newTestAlertingRule("empty labels", 0),
[][]datasource.Metric{
{datasource.Metric{Values: []float64{1}, Timestamps: []int64{1}}},
},
map[int][]testAlert{
0: {{alert: &notifier.Alert{State: notifier.StateFiring}}},
},
},
{
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
{metricWithLabels(t, "name", "foo")},
{},
{},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
},
},
{
newTestAlertingRule("single-firing=>inactive=>firing=>inactive=>inactive=>firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{},
{metricWithLabels(t, "name", "foo")},
{},
{},
{metricWithLabels(t, "name", "foo")},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
5: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
},
},
{
newTestAlertingRule("multiple-firing", 0),
[][]datasource.Metric{
{
metricWithLabels(t, "name", "foo"),
metricWithLabels(t, "name", "foo1"),
metricWithLabels(t, "name", "foo2"),
},
},
map[int][]testAlert{
0: {
{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
{labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateFiring}},
{labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}},
},
},
},
{
newTestAlertingRule("multiple-steps-firing", 0),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo1")},
{metricWithLabels(t, "name", "foo2")},
},
// 1: fire first alert
// 2: fire second alert, set first inactive
// 3: fire third alert, set second inactive
map[int][]testAlert{
0: {
{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}},
},
1: {
{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
{labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateFiring}},
},
2: {
{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}},
{labels: []string{"name", "foo1"}, alert: &notifier.Alert{State: notifier.StateInactive}},
{labels: []string{"name", "foo2"}, alert: &notifier.Alert{State: notifier.StateFiring}},
},
},
},
{
newTestAlertingRule("for-pending", time.Minute),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
},
},
{
newTestAlertingRule("for-fired", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
},
},
{
newTestAlertingRule("for-pending=>empty", time.Second),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to delete pending alerts
{},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
2: {},
},
},
{
newTestAlertingRule("for-pending=>firing=>inactive=>pending=>firing", defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to set alert inactive
{},
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
},
},
{
newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>firing", defaultStep, defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to keep firing
{},
{metricWithLabels(t, "name", "foo")},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
},
},
{
newTestAlertingRuleWithKeepFiring("for-pending=>firing=>keepfiring=>keepfiring=>inactive=>pending=>firing", defaultStep, 2*defaultStep),
[][]datasource.Metric{
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
// empty step to keep firing
{},
// another empty step to keep firing
{},
// empty step to set alert inactive
{},
{metricWithLabels(t, "name", "foo")},
{metricWithLabels(t, "name", "foo")},
},
map[int][]testAlert{
0: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
1: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
2: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
3: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
4: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateInactive}}},
5: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StatePending}}},
6: {{labels: []string{"name", "foo"}, alert: &notifier.Alert{State: notifier.StateFiring}}},
},
},
}
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
ts := time.Now()
for i, step := range tc.steps {
fq.Reset()
fq.Add(step...)
if _, err := tc.rule.exec(context.TODO(), ts, 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
// shift the execution timestamp before the next iteration
ts = ts.Add(defaultStep)
if _, ok := tc.expAlerts[i]; !ok {
continue
}
if len(tc.rule.alerts) != len(tc.expAlerts[i]) {
t.Fatalf("evalIndex %d: expected %d alerts; got %d", i, len(tc.expAlerts[i]), len(tc.rule.alerts))
}
expAlerts := make(map[uint64]*notifier.Alert)
for _, ta := range tc.expAlerts[i] {
labels := make(map[string]string)
for i := 0; i < len(ta.labels); i += 2 {
k, v := ta.labels[i], ta.labels[i+1]
labels[k] = v
}
labels[alertNameLabel] = tc.rule.Name
h := hash(labels)
expAlerts[h] = ta.alert
}
for key, exp := range expAlerts {
got, ok := tc.rule.alerts[key]
if !ok {
t.Fatalf("evalIndex %d: expected to have key %d", i, key)
}
if got.State != exp.State {
t.Fatalf("evalIndex %d: expected state %d; got %d", i, exp.State, got.State)
}
}
}
})
}
}
func TestAlertingRule_ExecRange(t *testing.T) {
fakeGroup := Group{Name: "TestRule_ExecRange"}
testCases := []struct {
rule *AlertingRule
data []datasource.Metric
expAlerts []*notifier.Alert
expHoldAlertStateAlerts map[uint64]*notifier.Alert
}{
{
newTestAlertingRule("empty", 0),
[]datasource.Metric{},
nil,
nil,
},
{
newTestAlertingRule("empty labels", 0),
[]datasource.Metric{
{Values: []float64{1}, Timestamps: []int64{1}},
},
[]*notifier.Alert{
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
},
nil,
},
{
newTestAlertingRule("single-firing", 0),
[]datasource.Metric{
metricWithLabels(t, "name", "foo"),
},
[]*notifier.Alert{
{
Labels: map[string]string{"name": "foo"},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
},
},
nil,
},
{
newTestAlertingRule("single-firing-on-range", 0),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
},
[]*notifier.Alert{
{State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)},
},
nil,
},
{
newTestAlertingRule("for-pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(3, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-pending"}): {
GroupID: fakeGroup.ID(),
Name: "for-pending",
Labels: map[string]string{"alertname": "for-pending"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: time.Second,
}},
},
{
newTestAlertingRule("for-firing", 3*time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-firing"}): {
GroupID: fakeGroup.ID(),
Name: "for-firing",
Labels: map[string]string{"alertname": "for-firing"},
Annotations: map[string]string{},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
Start: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
}},
},
{
newTestAlertingRule("for-hold-pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 2, 5}},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
},
map[uint64]*notifier.Alert{hash(map[string]string{"alertname": "for-hold-pending"}): {
GroupID: fakeGroup.ID(),
Name: "for-hold-pending",
Labels: map[string]string{"alertname": "for-hold-pending"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: time.Second,
}},
},
{
newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}},
},
[]*notifier.Alert{
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
// It is expected for ActiveAT to remain the same while rule continues to fire in each iteration
{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
},
nil,
},
{
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1, 1, 1}, Timestamps: []int64{1, 2, 5, 6, 20}},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(5, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(20, 0)},
},
nil,
},
{
newTestAlertingRule("multi-series", 3*time.Second),
[]datasource.Metric{
{Values: []float64{1, 1, 1}, Timestamps: []int64{1, 3, 5}},
{
Values: []float64{1, 1}, Timestamps: []int64{1, 5},
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
},
},
[]*notifier.Alert{
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StatePending, ActiveAt: time.Unix(1, 0)},
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
{
State: notifier.StatePending, ActiveAt: time.Unix(1, 0),
Labels: map[string]string{
"foo": "bar",
},
},
{
State: notifier.StatePending, ActiveAt: time.Unix(5, 0),
Labels: map[string]string{
"foo": "bar",
},
},
},
map[uint64]*notifier.Alert{
hash(map[string]string{"alertname": "multi-series"}): {
GroupID: fakeGroup.ID(),
Name: "multi-series",
Labels: map[string]string{"alertname": "multi-series"},
Annotations: map[string]string{},
State: notifier.StateFiring,
ActiveAt: time.Unix(1, 0),
Start: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
},
hash(map[string]string{"alertname": "multi-series", "foo": "bar"}): {
GroupID: fakeGroup.ID(),
Name: "multi-series",
Labels: map[string]string{"alertname": "multi-series", "foo": "bar"},
Annotations: map[string]string{},
State: notifier.StatePending,
ActiveAt: time.Unix(5, 0),
Value: 1,
For: 3 * time.Second,
},
},
},
{
newTestRuleWithLabels("multi-series-firing", "source", "vm"),
[]datasource.Metric{
{Values: []float64{1, 1}, Timestamps: []int64{1, 100}},
{
Values: []float64{1, 1}, Timestamps: []int64{1, 5},
Labels: []datasource.Label{{Name: "foo", Value: "bar"}},
},
},
[]*notifier.Alert{
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
Labels: map[string]string{
"source": "vm",
}},
{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
Labels: map[string]string{
"source": "vm",
}},
//
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
Labels: map[string]string{
"foo": "bar",
"source": "vm",
}},
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
Labels: map[string]string{
"foo": "bar",
"source": "vm",
}},
},
nil,
},
}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &datasource.FakeQuerier{}
tc.rule.q = fq
tc.rule.GroupID = fakeGroup.ID()
fq.Add(tc.data...)
gotTS, err := tc.rule.execRange(context.TODO(), time.Unix(1, 0), time.Unix(5, 0))
if err != nil {
t.Fatalf("unexpected err: %s", err)
}
var expTS []prompbmarshal.TimeSeries
var j int
for _, series := range tc.data {
for _, timestamp := range series.Timestamps {
a := tc.expAlerts[j]
if a.Labels == nil {
a.Labels = make(map[string]string)
}
a.Labels[alertNameLabel] = tc.rule.Name
expTS = append(expTS, tc.rule.alertToTimeSeries(a, timestamp)...)
j++
}
}
if len(gotTS) != len(expTS) {
t.Fatalf("expected %d time series; got %d", len(expTS), len(gotTS))
}
for i := range expTS {
got, exp := gotTS[i], expTS[i]
if !reflect.DeepEqual(got, exp) {
t.Fatalf("%d: expected \n%v but got \n%v", i, exp, got)
}
}
if tc.expHoldAlertStateAlerts != nil {
if !reflect.DeepEqual(tc.expHoldAlertStateAlerts, tc.rule.alerts) {
t.Fatalf("expected hold alerts state: \n%v but got \n%v", tc.expHoldAlertStateAlerts, tc.rule.alerts)
}
}
})
}
}
func TestGroup_Restore(t *testing.T) {
defaultTS := time.Now()
fqr := &datasource.FakeQuerierWithRegistry{}
fn := func(rules []config.Rule, expAlerts map[uint64]*notifier.Alert) {
t.Helper()
defer fqr.Reset()
for _, r := range rules {
fqr.Set(r.Expr, metricWithValueAndLabels(t, 0, "__name__", r.Alert))
}
fg := NewGroup(config.Group{Name: "TestRestore", Rules: rules}, fqr, time.Second, nil)
wg := sync.WaitGroup{}
wg.Add(1)
go func() {
nts := func() []notifier.Notifier { return []notifier.Notifier{&notifier.FakeNotifier{}} }
fg.Start(context.Background(), nts, nil, fqr)
wg.Done()
}()
fg.Close()
wg.Wait()
gotAlerts := make(map[uint64]*notifier.Alert)
for _, rs := range fg.Rules {
alerts := rs.(*AlertingRule).alerts
for k, v := range alerts {
if !v.Restored {
// set not restored alerts to predictable timestamp
v.ActiveAt = defaultTS
}
gotAlerts[k] = v
}
}
if len(gotAlerts) != len(expAlerts) {
t.Fatalf("expected %d alerts; got %d", len(expAlerts), len(gotAlerts))
}
for key, exp := range expAlerts {
got, ok := gotAlerts[key]
if !ok {
t.Fatalf("expected to have key %d", key)
}
if got.State != notifier.StatePending {
t.Fatalf("expected state %d; got %d", notifier.StatePending, got.State)
}
if got.ActiveAt != exp.ActiveAt {
t.Fatalf("expected ActiveAt %v; got %v", exp.ActiveAt, got.ActiveAt)
}
if got.Name != exp.Name {
t.Fatalf("expected alertname %q; got %q", exp.Name, got.Name)
}
}
}
stateMetric := func(name string, value time.Time, labels ...string) datasource.Metric {
labels = append(labels, "__name__", alertForStateMetricName)
labels = append(labels, alertNameLabel, name)
labels = append(labels, alertGroupNameLabel, "TestRestore")
return metricWithValueAndLabels(t, float64(value.Unix()), labels...)
}
// one active alert, no previous state
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: defaultTS,
},
})
fqr.Reset()
// one active alert with state restore
ts := time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: ts,
},
})
// two rules, two active alerts, one with state restored
ts = time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts))
fn(
[]config.Rule{
{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
{Alert: "bar", Expr: "bar", For: promutils.NewDuration(time.Second)},
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: defaultTS,
},
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
Name: "bar",
ActiveAt: ts,
},
})
// two rules, two active alerts, two with state restored
ts = time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo"}[3600s])`,
stateMetric("foo", ts))
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="bar"}[3600s])`,
stateMetric("bar", ts))
fn(
[]config.Rule{
{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)},
{Alert: "bar", Expr: "bar", For: promutils.NewDuration(time.Second)},
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: ts,
},
hash(map[string]string{alertNameLabel: "bar", alertGroupNameLabel: "TestRestore"}): {
Name: "bar",
ActiveAt: ts,
},
})
// one active alert but wrong state restore
ts = time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertname="bar",alertgroup="TestRestore"}[3600s])`,
stateMetric("wrong alert", ts))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore"}): {
Name: "foo",
ActiveAt: defaultTS,
},
})
// one active alert with labels
ts = time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore", "env": "dev"}): {
Name: "foo",
ActiveAt: ts,
},
})
// one active alert with restore labels missmatch
ts = time.Now().Truncate(time.Hour)
fqr.Set(`last_over_time(ALERTS_FOR_STATE{alertgroup="TestRestore",alertname="foo",env="dev"}[3600s])`,
stateMetric("foo", ts, "env", "dev", "team", "foo"))
fn(
[]config.Rule{{Alert: "foo", Expr: "foo", Labels: map[string]string{"env": "dev"}, For: promutils.NewDuration(time.Second)}},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "foo", alertGroupNameLabel: "TestRestore", "env": "dev"}): {
Name: "foo",
ActiveAt: defaultTS,
},
})
}
func TestAlertingRule_Exec_Negative(t *testing.T) {
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
// successful attempt
// label `job` will be overridden by rule extra label, the original value will be reserved by "exported_job"
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "baz"))
_, err := ar.exec(context.TODO(), time.Now(), 0)
if err != nil {
t.Fatal(err)
}
// label `__name__` will be omitted and get duplicated results here
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo_1", "job", "bar"))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if !errors.Is(err, errDuplicate) {
t.Fatalf("expected to have %s error; got %s", errDuplicate, err)
}
fq.Reset()
expErr := "connection reset by peer"
fq.SetErr(errors.New(expErr))
_, err = ar.exec(context.TODO(), time.Now(), 0)
if err == nil {
t.Fatalf("expected to get err; got nil")
}
if !strings.Contains(err.Error(), expErr) {
t.Fatalf("expected to get err %q; got %q insterad", expErr, err)
}
}
func TestAlertingRuleLimit(t *testing.T) {
fq := &datasource.FakeQuerier{}
ar := newTestAlertingRule("test", 0)
ar.Labels = map[string]string{"job": "test"}
ar.q = fq
ar.For = time.Minute
testCases := []struct {
limit int
err string
tssNum int
}{
{
limit: 0,
tssNum: 4,
},
{
limit: -1,
tssNum: 4,
},
{
limit: 1,
err: "exec exceeded limit of 1 with 2 alerts",
tssNum: 0,
},
{
limit: 4,
tssNum: 4,
},
}
var (
err error
timestamp = time.Now()
)
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "job", "bar"))
fq.Add(metricWithValueAndLabels(t, 1, "__name__", "foo", "bar", "job"))
for _, testCase := range testCases {
_, err = ar.exec(context.TODO(), timestamp, testCase.limit)
if err != nil && !strings.EqualFold(err.Error(), testCase.err) {
t.Fatal(err)
}
}
fq.Reset()
}
func TestAlertingRule_Template(t *testing.T) {
testCases := []struct {
rule *AlertingRule
metrics []datasource.Metric
expAlerts map[uint64]*notifier.Alert
}{
{
&AlertingRule{
Name: "common",
Labels: map[string]string{
"region": "east",
},
Annotations: map[string]string{
"summary": `{{ $labels.alertname }}: Too high connection number for "{{ $labels.instance }}"`,
},
alerts: make(map[uint64]*notifier.Alert),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 1, "instance", "foo"),
metricWithValueAndLabels(t, 1, "instance", "bar"),
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "foo"}): {
Annotations: map[string]string{
"summary": `common: Too high connection number for "foo"`,
},
Labels: map[string]string{
alertNameLabel: "common",
"region": "east",
"instance": "foo",
},
},
hash(map[string]string{alertNameLabel: "common", "region": "east", "instance": "bar"}): {
Annotations: map[string]string{
"summary": `common: Too high connection number for "bar"`,
},
Labels: map[string]string{
alertNameLabel: "common",
"region": "east",
"instance": "bar",
},
},
},
},
{
&AlertingRule{
Name: "override label",
Labels: map[string]string{
"instance": "{{ $labels.instance }}",
},
Annotations: map[string]string{
"summary": `{{ $labels.__name__ }}: Too high connection number for "{{ $labels.instance }}"`,
"description": `{{ $labels.alertname}}: It is {{ $value }} connections for "{{ $labels.instance }}"`,
},
alerts: make(map[uint64]*notifier.Alert),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 2, "__name__", "first", "instance", "foo", alertNameLabel, "override"),
metricWithValueAndLabels(t, 10, "__name__", "second", "instance", "bar", alertNameLabel, "override"),
},
map[uint64]*notifier.Alert{
hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "foo"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"exported_alertname": "override",
"instance": "foo",
},
Annotations: map[string]string{
"summary": `first: Too high connection number for "foo"`,
"description": `override: It is 2 connections for "foo"`,
},
},
hash(map[string]string{alertNameLabel: "override label", "exported_alertname": "override", "instance": "bar"}): {
Labels: map[string]string{
alertNameLabel: "override label",
"exported_alertname": "override",
"instance": "bar",
},
Annotations: map[string]string{
"summary": `second: Too high connection number for "bar"`,
"description": `override: It is 10 connections for "bar"`,
},
},
},
},
{
&AlertingRule{
Name: "OriginLabels",
GroupName: "Testing",
Labels: map[string]string{
"instance": "{{ $labels.instance }}",
},
Annotations: map[string]string{
"summary": `Alert "{{ $labels.alertname }}({{ $labels.alertgroup }})" for instance {{ $labels.instance }}`,
},
alerts: make(map[uint64]*notifier.Alert),
},
[]datasource.Metric{
metricWithValueAndLabels(t, 1,
alertNameLabel, "originAlertname",
alertGroupNameLabel, "originGroupname",
"instance", "foo"),
},
map[uint64]*notifier.Alert{
hash(map[string]string{
alertNameLabel: "OriginLabels",
"exported_alertname": "originAlertname",
alertGroupNameLabel: "Testing",
"exported_alertgroup": "originGroupname",
"instance": "foo",
}): {
Labels: map[string]string{
alertNameLabel: "OriginLabels",
"exported_alertname": "originAlertname",
alertGroupNameLabel: "Testing",
"exported_alertgroup": "originGroupname",
"instance": "foo",
},
Annotations: map[string]string{
"summary": `Alert "originAlertname(originGroupname)" for instance foo`,
},
},
},
},
}
fakeGroup := Group{Name: "TestRule_Exec"}
for _, tc := range testCases {
t.Run(tc.rule.Name, func(t *testing.T) {
fq := &datasource.FakeQuerier{}
tc.rule.GroupID = fakeGroup.ID()
tc.rule.q = fq
tc.rule.state = &ruleState{entries: make([]StateEntry, 10)}
fq.Add(tc.metrics...)
if _, err := tc.rule.exec(context.TODO(), time.Now(), 0); err != nil {
t.Fatalf("unexpected err: %s", err)
}
for hash, expAlert := range tc.expAlerts {
gotAlert := tc.rule.alerts[hash]
if gotAlert == nil {
t.Fatalf("alert %d is missing; labels: %v; annotations: %v", hash, expAlert.Labels, expAlert.Annotations)
}
if !reflect.DeepEqual(expAlert.Annotations, gotAlert.Annotations) {
t.Fatalf("expected to have annotations %#v; got %#v", expAlert.Annotations, gotAlert.Annotations)
}
if !reflect.DeepEqual(expAlert.Labels, gotAlert.Labels) {
t.Fatalf("expected to have labels %#v; got %#v", expAlert.Labels, gotAlert.Labels)
}
}
})
}
}
func TestAlertsToSend(t *testing.T) {
ts := time.Now()
f := func(alerts, expAlerts []*notifier.Alert, resolveDuration, resendDelay time.Duration) {
t.Helper()
ar := &AlertingRule{alerts: make(map[uint64]*notifier.Alert)}
for i, a := range alerts {
ar.alerts[uint64(i)] = a
}
gotAlerts := ar.alertsToSend(ts, resolveDuration, resendDelay)
if gotAlerts == nil && expAlerts == nil {
return
}
if len(gotAlerts) != len(expAlerts) {
t.Fatalf("expected to get %d alerts; got %d instead",
len(expAlerts), len(gotAlerts))
}
sort.Slice(expAlerts, func(i, j int) bool {
return expAlerts[i].Name < expAlerts[j].Name
})
sort.Slice(gotAlerts, func(i, j int) bool {
return gotAlerts[i].Name < gotAlerts[j].Name
})
for i, exp := range expAlerts {
got := gotAlerts[i]
if got.LastSent != exp.LastSent {
t.Fatalf("expected LastSent to be %v; got %v", exp.LastSent, got.LastSent)
}
if got.End != exp.End {
t.Fatalf("expected End to be %v; got %v", exp.End, got.End)
}
}
}
f( // send firing alert with custom resolve time
[]*notifier.Alert{{State: notifier.StateFiring}},
[]*notifier.Alert{{LastSent: ts, End: ts.Add(5 * time.Minute)}},
5*time.Minute, time.Minute,
)
f( // resolve inactive alert at the current timestamp
[]*notifier.Alert{{State: notifier.StateInactive, ResolvedAt: ts}},
[]*notifier.Alert{{LastSent: ts, End: ts}},
time.Minute, time.Minute,
)
f( // mixed case of firing and resolved alerts. Names are added for deterministic sorting
[]*notifier.Alert{{Name: "a", State: notifier.StateFiring}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
[]*notifier.Alert{{Name: "a", LastSent: ts, End: ts.Add(5 * time.Minute)}, {Name: "b", LastSent: ts, End: ts}},
5*time.Minute, time.Minute,
)
f( // mixed case of pending and resolved alerts. Names are added for deterministic sorting
[]*notifier.Alert{{Name: "a", State: notifier.StatePending}, {Name: "b", State: notifier.StateInactive, ResolvedAt: ts}},
[]*notifier.Alert{{Name: "b", LastSent: ts, End: ts}},
5*time.Minute, time.Minute,
)
f( // attempt to send alert that was already sent in the resendDelay interval
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
nil,
time.Minute, time.Minute,
)
f( // attempt to send alert that was sent out of the resendDelay interval
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-2 * time.Minute)}},
[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
time.Minute, time.Minute,
)
f( // alert must be sent even if resendDelay interval is 0
[]*notifier.Alert{{State: notifier.StateFiring, LastSent: ts.Add(-time.Second)}},
[]*notifier.Alert{{LastSent: ts, End: ts.Add(time.Minute)}},
time.Minute, 0,
)
f( // inactive alert which has been sent already
[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts.Add(-2 * time.Second)}},
nil,
time.Minute, time.Minute,
)
f( // inactive alert which has been resolved after last send
[]*notifier.Alert{{State: notifier.StateInactive, LastSent: ts.Add(-time.Second), ResolvedAt: ts}},
[]*notifier.Alert{{LastSent: ts, End: ts}},
time.Minute, time.Minute,
)
}
func newTestRuleWithLabels(name string, labels ...string) *AlertingRule {
r := newTestAlertingRule(name, 0)
r.Labels = make(map[string]string)
for i := 0; i < len(labels); i += 2 {
r.Labels[labels[i]] = labels[i+1]
}
return r
}
func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
rule := AlertingRule{
Name: name,
For: waitFor,
EvalInterval: waitFor,
alerts: make(map[uint64]*notifier.Alert),
state: &ruleState{entries: make([]StateEntry, 10)},
metrics: &alertingRuleMetrics{
errors: utils.GetOrCreateCounter(fmt.Sprintf(`vmalert_alerting_rules_errors_total{alertname=%q}`, name)),
},
}
return &rule
}
func newTestAlertingRuleWithEvalInterval(name string, waitFor, evalInterval time.Duration) *AlertingRule {
rule := newTestAlertingRule(name, waitFor)
rule.EvalInterval = evalInterval
return rule
}
func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
rule := newTestAlertingRule(name, waitFor)
rule.KeepFiringFor = keepFiringFor
return rule
}
func TestAlertingRule_ToLabels(t *testing.T) {
metric := datasource.Metric{
Labels: []datasource.Label{
{Name: "instance", Value: "0.0.0.0:8800"},
{Name: "group", Value: "vmalert"},
{Name: "alertname", Value: "ConfigurationReloadFailure"},
},
Values: []float64{1},
Timestamps: []int64{time.Now().UnixNano()},
}
ar := &AlertingRule{
Labels: map[string]string{
"instance": "override", // this should override instance with new value
"group": "vmalert", // this shouldn't have effect since value in metric is equal
},
Expr: "sum(vmalert_alerting_rules_error) by(instance, group, alertname) > 0",
Name: "AlertingRulesError",
GroupName: "vmalert",
}
expectedOriginLabels := map[string]string{
"instance": "0.0.0.0:8800",
"group": "vmalert",
"alertname": "ConfigurationReloadFailure",
"alertgroup": "vmalert",
}
expectedProcessedLabels := map[string]string{
"instance": "override",
"exported_instance": "0.0.0.0:8800",
"alertname": "AlertingRulesError",
"exported_alertname": "ConfigurationReloadFailure",
"group": "vmalert",
"alertgroup": "vmalert",
}
ls, err := ar.toLabels(metric, nil)
if err != nil {
t.Fatalf("unexpected error: %s", err)
}
if !reflect.DeepEqual(ls.origin, expectedOriginLabels) {
t.Errorf("origin labels mismatch, got: %v, want: %v", ls.origin, expectedOriginLabels)
}
if !reflect.DeepEqual(ls.processed, expectedProcessedLabels) {
t.Errorf("processed labels mismatch, got: %v, want: %v", ls.processed, expectedProcessedLabels)
}
}