app/vmalert: autogenerate ALERTS_FOR_STATE time series for alerting rules with for: 0 (#5680)

* app/vmalert: autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0` Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. This behavior differs from Prometheus behavior - it generates ALERTS_FOR_STATE time series for alerting rules with `for: 0` as well. Such time series can be useful for tracking the moment when alerting rule became active. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3056 Signed-off-by: hagen1778 <roman@victoriametrics.com> * app/vmalert: support ALERTS_FOR_STATE in `replay` mode Signed-off-by: hagen1778 <roman@victoriametrics.com> --------- Signed-off-by: hagen1778 <roman@victoriametrics.com>
2024-12-15 00:13:30 +01:00 · 2024-01-25 15:42:57 +01:00 · 2024-01-25 15:42:57 +01:00 · a2f83115ae
commit a2f83115ae
parent 362e52f880
4 changed files with 85 additions and 46 deletions
--- a/app/vmalert/Makefile
+++ b/app/vmalert/Makefile
@ -68,6 +68,7 @@ publish-vmalert:
 test-vmalert:
 	go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
 	go test -v -race -cover ./app/vmalert/rule
 	go test -v -race -cover ./app/vmalert/templates
 	go test -v -race -cover ./app/vmalert/datasource
 	go test -v -race -cover ./app/vmalert/notifier
--- a/app/vmalert/rule/alerting.go
+++ b/app/vmalert/rule/alerting.go
@ -324,16 +324,6 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 			return nil, fmt.Errorf("failed to create alert: %w", err)
 		}
 		// if alert is instant, For: 0
 		if ar.For == 0 {
 			a.State = notifier.StateFiring
 			for i := range s.Values {
 				result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
 			}
 			continue
 		}
 		// if alert with For > 0
 		prevT := time.Time{}
 		for i := range s.Values {
 			at := time.Unix(s.Timestamps[i], 0)
@ -354,6 +344,10 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
 				a.Start = at
 			}
 			prevT = at
 			if ar.For == 0 {
 				// rules with `for: 0` are always firing when they have Value
 				a.State = notifier.StateFiring
 			}
 			result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
 			// save alert's state on last iteration, so it can be used on the next execRange call
@ -559,9 +553,9 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
 }
 const (
-	// alertMetricName is the metric name for synthetic alert timeseries.
+	// alertMetricName is the metric name for time series reflecting the alert state.
 	alertMetricName = "ALERTS"
-	// alertForStateMetricName is the metric name for 'for' state of alert.
+	// alertForStateMetricName is the metric name for time series reflecting the moment of time when alert became active.
 	alertForStateMetricName = "ALERTS_FOR_STATE"
 	// alertNameLabel is the label name indicating the name of an alert.
@ -576,12 +570,10 @@ const (
 // alertToTimeSeries converts the given alert with the given timestamp to time series
 func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
-	var tss []prompbmarshal.TimeSeries
+	return []prompbmarshal.TimeSeries{
-	tss = append(tss, alertToTimeSeries(a, timestamp))
+		alertToTimeSeries(a, timestamp),
-	if ar.For > 0 {
+		alertForToTimeSeries(a, timestamp),
 		tss = append(tss, alertForToTimeSeries(a, timestamp))
 	}
 	return tss
 }
 func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
--- a/app/vmalert/rule/alerting_test.go
+++ b/app/vmalert/rule/alerting_test.go
@ -28,20 +28,26 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 	}{
 		{
 			newTestAlertingRule("instant", 0),
-			&notifier.Alert{State: notifier.StateFiring},
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
 					alertStateLabel: notifier.StateFiring.String(),
 				}),
 				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
 					[]int64{timestamp.UnixNano()},
 					map[string]string{
 						"__name__": alertForStateMetricName,
 					}),
 			},
 		},
 		{
 			newTestAlertingRule("instant extra labels", 0),
-			&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
-				"job":      "foo",
+				Labels: map[string]string{
-				"instance": "bar",
+					"job":      "foo",
-			}},
+					"instance": "bar",
 				}},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
@ -49,19 +55,33 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
 					"job":           "foo",
 					"instance":      "bar",
 				}),
 				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
 					[]int64{timestamp.UnixNano()},
 					map[string]string{
 						"__name__": alertForStateMetricName,
 						"job":      "foo",
 						"instance": "bar",
 					}),
 			},
 		},
 		{
 			newTestAlertingRule("instant labels override", 0),
-			&notifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
+			&notifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
-				alertStateLabel: "foo",
+				Labels: map[string]string{
-				"__name__":      "bar",
+					alertStateLabel: "foo",
-			}},
+					"__name__":      "bar",
 				}},
 			[]prompbmarshal.TimeSeries{
 				newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
 					"__name__":      alertMetricName,
 					alertStateLabel: notifier.StateFiring.String(),
 				}),
 				newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
 					[]int64{timestamp.UnixNano()},
 					map[string]string{
 						"__name__":      alertForStateMetricName,
 						alertStateLabel: "foo",
 					}),
 			},
 		},
 		{
@ -367,7 +387,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{Values: []float64{1}, Timestamps: []int64{1}},
 			},
 			[]*notifier.Alert{
-				{State: notifier.StateFiring},
+				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
 			},
 			nil,
 		},
@ -378,8 +398,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 			},
 			[]*notifier.Alert{
 				{
-					Labels: map[string]string{"name": "foo"},
+					Labels:   map[string]string{"name": "foo"},
-					State:  notifier.StateFiring,
+					State:    notifier.StateFiring,
 					ActiveAt: time.Unix(1, 0),
 				},
 			},
 			nil,
@ -390,9 +411,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
 			},
 			[]*notifier.Alert{
-				{State: notifier.StateFiring},
+				{State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)},
-				{State: notifier.StateFiring},
+				{State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)},
-				{State: notifier.StateFiring},
+				{State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)},
 			},
 			nil,
 		},
@ -460,6 +481,20 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				For:         time.Second,
 			}},
 		},
 		{
 			newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second),
 			[]datasource.Metric{
 				{Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}},
 			},
 			[]*notifier.Alert{
 				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
 				// It is expected for ActiveAT to remain the same while rule continues to fire in each iteration
 				{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
 				{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
 				{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
 			},
 			nil,
 		},
 		{
 			newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
 			[]datasource.Metric{
@ -534,21 +569,25 @@ func TestAlertingRule_ExecRange(t *testing.T) {
 				},
 			},
 			[]*notifier.Alert{
-				{State: notifier.StateFiring, Labels: map[string]string{
+				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
-					"source": "vm",
+					Labels: map[string]string{
-				}},
+						"source": "vm",
-				{State: notifier.StateFiring, Labels: map[string]string{
+					}},
-					"source": "vm",
+				{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
-				}},
+					Labels: map[string]string{
 						"source": "vm",
 					}},
 				//
-				{State: notifier.StateFiring, Labels: map[string]string{
+				{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
-					"foo":    "bar",
+					Labels: map[string]string{
-					"source": "vm",
+						"foo":    "bar",
-				}},
+						"source": "vm",
-				{State: notifier.StateFiring, Labels: map[string]string{
+					}},
-					"foo":    "bar",
+				{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
-					"source": "vm",
+					Labels: map[string]string{
-				}},
+						"foo":    "bar",
 						"source": "vm",
 					}},
 			},
 			nil,
 		},
@ -1095,6 +1134,12 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
 	return &rule
 }
 func newTestAlertingRuleWithEvalInterval(name string, waitFor, evalInterval time.Duration) *AlertingRule {
 	rule := newTestAlertingRule(name, waitFor)
 	rule.EvalInterval = evalInterval
 	return rule
 }
 func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
 	rule := newTestAlertingRule(name, waitFor)
 	rule.KeepFiringFor = keepFiringFor
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -70,6 +70,7 @@ The sandbox cluster installation is running under the constant load generated by
 * BUGFIX: `vmstorage`: properly expire `storage/prefetchedMetricIDs` cache. Previously this cache was never expired, so it could grow big under [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). This could result in increasing CPU load over time.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
 * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0`. Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. The change aligns with Prometheus behavior. See more details in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648). 
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return results from [bottomk](https://docs.victoriametrics.com/MetricsQL.html#bottomk) and `bottomk_*()` functions when some of these results contain NaN values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5506). Thanks to @xiaozongyang for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5509).
 * BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).