From a2f83115ae697bb2e0eb20e11191aeef2692ada8 Mon Sep 17 00:00:00 2001
From: Roman Khavronenko
Date: Thu, 25 Jan 2024 15:42:57 +0100
Subject: [PATCH] app/vmalert: autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0` (#5680)

* app/vmalert: autogenerate `ALERTS_FOR_STATE` time series for alerting rules
with `for: 0`

Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`.
This differed from Prometheus, which generates ALERTS_FOR_STATE time series
for alerting rules with `for: 0` as well. Such time series can be useful for
tracking the moment when an alerting rule became active.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648
https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3056

Signed-off-by: hagen1778

* app/vmalert: support ALERTS_FOR_STATE in `replay` mode

Signed-off-by: hagen1778

---------

Signed-off-by: hagen1778
---
 app/vmalert/Makefile              |   1 +
 app/vmalert/rule/alerting.go      |  26 +++-----
 app/vmalert/rule/alerting_test.go | 103 +++++++++++++++++++++---------
 docs/CHANGELOG.md                 |   1 +
 4 files changed, 85 insertions(+), 46 deletions(-)

diff --git a/app/vmalert/Makefile b/app/vmalert/Makefile
index 9f200f0c22..0c57ffb299 100644
--- a/app/vmalert/Makefile
+++ b/app/vmalert/Makefile
@@ -68,6 +68,7 @@ publish-vmalert:
 
 test-vmalert:
 	go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
+	go test -v -race -cover ./app/vmalert/rule
 	go test -v -race -cover ./app/vmalert/templates
 	go test -v -race -cover ./app/vmalert/datasource
 	go test -v -race -cover ./app/vmalert/notifier
diff --git a/app/vmalert/rule/alerting.go b/app/vmalert/rule/alerting.go
index 7ca4427eb7..be50ef1335 100644
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@@ -324,16 +324,6 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 			return nil, fmt.Errorf("failed to create alert: %w", err)
 		}
 
-		// if alert is instant, For: 0
-		if ar.For == 0 {
-			a.State = notifier.StateFiring
-			for i := range s.Values {
-				result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
-			}
-			continue
-		}
-
-		// if alert with For > 0
 		prevT := time.Time{}
 		for i := range s.Values {
 			at := time.Unix(s.Timestamps[i], 0)
@@ -354,6 +344,10 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 				a.Start = at
 			}
 			prevT = at
+			if ar.For == 0 {
+				// rules with `for: 0` are always firing when they have Value
+				a.State = notifier.StateFiring
+			}
 			result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
 
 			// save alert's state on last iteration, so it can be used on the next execRange call
@@ -559,9 +553,9 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
 }
 
 const (
-	// alertMetricName is the metric name for synthetic alert timeseries.
+	// alertMetricName is the metric name for time series reflecting the alert state.
 	alertMetricName = "ALERTS"
-	// alertForStateMetricName is the metric name for 'for' state of alert.
+	// alertForStateMetricName is the metric name for time series reflecting the moment of time when alert became active.
 	alertForStateMetricName = "ALERTS_FOR_STATE"
 
 	// alertNameLabel is the label name indicating the name of an alert.
@@ -576,12 +570,10 @@ const (
 
 // alertToTimeSeries converts the given alert with the given timestamp to time series
 func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
-	var tss []prompbmarshal.TimeSeries
-	tss = append(tss, alertToTimeSeries(a, timestamp))
-	if ar.For > 0 {
-		tss = append(tss, alertForToTimeSeries(a, timestamp))
+	return []prompbmarshal.TimeSeries{
+		alertToTimeSeries(a, timestamp),
+		alertForToTimeSeries(a, timestamp),
 	}
-	return tss
 }
 
 func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
diff --git a/app/vmalert/rule/alerting_test.go b/app/vmalert/rule/alerting_test.go
index 3e5e3503f9..5b7fb54045 100644
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@@ -28,20 +28,26 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 	}{
 		{
 			newTestAlertingRule("instant", 0),
-			&notifier.Alert{State: notifier.StateFiring},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
 					alertStateLabel: notifier.StateFiring.String(),
 				}),
+				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
+					[]int64{timestamp.UnixNano()},
+					map[string]string{
+						"__name__": alertForStateMetricName,
+					}),
 			},
 		},
 		{
 			newTestAlertingRule("instant extra labels", 0),
-			&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
-				"job":      "foo",
-				"instance": "bar",
-			}},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+				Labels: map[string]string{
+					"job":      "foo",
+					"instance": "bar",
+				}},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
@@ -49,19 +55,33 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 					"job":      "foo",
 					"instance": "bar",
 				}),
+				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
+					[]int64{timestamp.UnixNano()},
+					map[string]string{
+						"__name__": alertForStateMetricName,
+						"job":      "foo",
+						"instance": "bar",
+					}),
 			},
 		},
 		{
 			newTestAlertingRule("instant labels override", 0),
-			&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
-				alertStateLabel: "foo",
-				"__name__":      "bar",
-			}},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
+				Labels: map[string]string{
+					alertStateLabel: "foo",
+					"__name__":      "bar",
+				}},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
 					alertStateLabel: notifier.StateFiring.String(),
 				}),
+				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
+					[]int64{timestamp.UnixNano()},
+					map[string]string{
+						"__name__":      alertForStateMetricName,
+						alertStateLabel: "foo",
+					}),
 			},
 		},
 		{
@@ -367,7 +387,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			{Values: []float64{1}, Timestamps: []int64{1}},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
 		},
 		nil,
 	},
@@ -378,8 +398,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 		},
 		[]*notifier.Alert{
 			{
-				Labels: map[string]string{"name": "foo"},
-				State:  notifier.StateFiring,
+				Labels:   map[string]string{"name": "foo"},
+				State:    notifier.StateFiring,
+				ActiveAt: time.Unix(1, 0),
 			},
 		},
 		nil,
@@ -390,9 +411,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring},
-			{State: notifier.StateFiring},
-			{State: notifier.StateFiring},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)},
 		},
 		nil,
 	},
@@ -460,6 +481,20 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			For: time.Second,
 		}},
 	},
+	{
+		newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second),
+		[]datasource.Metric{
+			{Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}},
+		},
+		[]*notifier.Alert{
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
+			// ActiveAt is expected to remain the same while the rule continues to fire on each iteration
+			{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
+		},
+		nil,
+	},
 	{
 		newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
 		[]datasource.Metric{
@@ -534,21 +569,25 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			},
 		},
 		[]*notifier.Alert{
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"source": "vm",
-			}},
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"source": "vm",
-			}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				Labels: map[string]string{
+					"source": "vm",
+				}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
+				Labels: map[string]string{
+					"source": "vm",
+				}},
 			//
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"foo":    "bar",
-				"source": "vm",
-			}},
-			{State: notifier.StateFiring, Labels: map[string]string{
-				"foo":    "bar",
-				"source": "vm",
-			}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
+				Labels: map[string]string{
+					"foo":    "bar",
+					"source": "vm",
+				}},
+			{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
+				Labels: map[string]string{
+					"foo":    "bar",
+					"source": "vm",
+				}},
 		},
 		nil,
 	},
@@ -1095,6 +1134,12 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
 	return &rule
 }
 
+func newTestAlertingRuleWithEvalInterval(name string, waitFor, evalInterval time.Duration) *AlertingRule {
+	rule := newTestAlertingRule(name, waitFor)
+	rule.EvalInterval = evalInterval
+	return rule
+}
+
 func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
 	rule := newTestAlertingRule(name, waitFor)
 	rule.KeepFiringFor = keepFiringFor
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index c7f0746202..96d6cdccc6 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -70,6 +70,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: `vmstorage`: properly expire `storage/prefetchedMetricIDs` cache. Previously this cache was never expired, so it could grow big under [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). This could result in increasing CPU load over time.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
+* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0`. Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. The change aligns with Prometheus behavior. See more details in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return results from [bottomk](https://docs.victoriametrics.com/MetricsQL.html#bottomk) and `bottomk_*()` functions when some of these results contain NaN values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5506). Thanks to @xiaozongyang for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5509).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
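
Illustration (a minimal, self-contained Go sketch, not vmalert code): the per-evaluation series shape that an alerting rule with `for: 0` now produces, mirroring the updated alertToTimeSeries above. The `series` type and `buildAlertSeries` helper are hypothetical stand-ins for vmalert internals, and the "alertstate" label name is assumed to match the alertStateLabel constant referenced in the diff.

package main

import (
	"fmt"
	"time"
)

// series is a simplified stand-in for one produced time series:
// metric name, label set and a single (timestamp, value) sample.
type series struct {
	name   string
	labels map[string]string
	ts     time.Time
	value  float64
}

// buildAlertSeries sketches what a single evaluation yields after this change:
// an ALERTS sample with value 1 carrying the alert state, plus an
// ALERTS_FOR_STATE sample whose value is the Unix timestamp at which the alert
// became active, now emitted even when the rule has `for: 0`.
func buildAlertSeries(state string, activeAt, evalTime time.Time) []series {
	return []series{
		{
			name:   "ALERTS",
			labels: map[string]string{"alertstate": state}, // assumed label name
			ts:     evalTime,
			value:  1,
		},
		{
			name:   "ALERTS_FOR_STATE",
			labels: map[string]string{},
			ts:     evalTime,
			value:  float64(activeAt.Unix()),
		},
	}
}

func main() {
	activeAt := time.Now().Add(-30 * time.Second)
	for _, s := range buildAlertSeries("firing", activeAt, time.Now()) {
		fmt.Printf("%s%v = %v @ %d\n", s.name, s.labels, s.value, s.ts.Unix())
	}
}

With the ALERTS_FOR_STATE sample present for `for: 0` rules as well, the moment a rule became active can be derived with a query along the lines of `time() - ALERTS_FOR_STATE`, the same way it already worked for rules with `for > 0`.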