mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2024-12-15 00:13:30 +01:00
app/vmalert: autogenerate ALERTS_FOR_STATE
time series for alerting rules with for: 0
(#5680)
* app/vmalert: autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0` Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. This behavior differs from Prometheus behavior - it generates ALERTS_FOR_STATE time series for alerting rules with `for: 0` as well. Such time series can be useful for tracking the moment when alerting rule became active. Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648 https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3056 Signed-off-by: hagen1778 <roman@victoriametrics.com> * app/vmalert: support ALERTS_FOR_STATE in `replay` mode Signed-off-by: hagen1778 <roman@victoriametrics.com> --------- Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
362e52f880
commit
a2f83115ae
@ -68,6 +68,7 @@ publish-vmalert:
|
|||||||
|
|
||||||
test-vmalert:
|
test-vmalert:
|
||||||
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
|
go test -v -race -cover ./app/vmalert -loggerLevel=ERROR
|
||||||
|
go test -v -race -cover ./app/vmalert/rule
|
||||||
go test -v -race -cover ./app/vmalert/templates
|
go test -v -race -cover ./app/vmalert/templates
|
||||||
go test -v -race -cover ./app/vmalert/datasource
|
go test -v -race -cover ./app/vmalert/datasource
|
||||||
go test -v -race -cover ./app/vmalert/notifier
|
go test -v -race -cover ./app/vmalert/notifier
|
||||||
|
@ -324,16 +324,6 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
|
|||||||
return nil, fmt.Errorf("failed to create alert: %w", err)
|
return nil, fmt.Errorf("failed to create alert: %w", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
// if alert is instant, For: 0
|
|
||||||
if ar.For == 0 {
|
|
||||||
a.State = notifier.StateFiring
|
|
||||||
for i := range s.Values {
|
|
||||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// if alert with For > 0
|
|
||||||
prevT := time.Time{}
|
prevT := time.Time{}
|
||||||
for i := range s.Values {
|
for i := range s.Values {
|
||||||
at := time.Unix(s.Timestamps[i], 0)
|
at := time.Unix(s.Timestamps[i], 0)
|
||||||
@ -354,6 +344,10 @@ func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]
|
|||||||
a.Start = at
|
a.Start = at
|
||||||
}
|
}
|
||||||
prevT = at
|
prevT = at
|
||||||
|
if ar.For == 0 {
|
||||||
|
// rules with `for: 0` are always firing when they have Value
|
||||||
|
a.State = notifier.StateFiring
|
||||||
|
}
|
||||||
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
||||||
|
|
||||||
// save alert's state on last iteration, so it can be used on the next execRange call
|
// save alert's state on last iteration, so it can be used on the next execRange call
|
||||||
@ -559,9 +553,9 @@ func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.T
|
|||||||
}
|
}
|
||||||
|
|
||||||
const (
|
const (
|
||||||
// alertMetricName is the metric name for synthetic alert timeseries.
|
// alertMetricName is the metric name for time series reflecting the alert state.
|
||||||
alertMetricName = "ALERTS"
|
alertMetricName = "ALERTS"
|
||||||
// alertForStateMetricName is the metric name for 'for' state of alert.
|
// alertForStateMetricName is the metric name for time series reflecting the moment of time when alert became active.
|
||||||
alertForStateMetricName = "ALERTS_FOR_STATE"
|
alertForStateMetricName = "ALERTS_FOR_STATE"
|
||||||
|
|
||||||
// alertNameLabel is the label name indicating the name of an alert.
|
// alertNameLabel is the label name indicating the name of an alert.
|
||||||
@ -576,12 +570,10 @@ const (
|
|||||||
|
|
||||||
// alertToTimeSeries converts the given alert with the given timestamp to time series
|
// alertToTimeSeries converts the given alert with the given timestamp to time series
|
||||||
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
|
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
|
||||||
var tss []prompbmarshal.TimeSeries
|
return []prompbmarshal.TimeSeries{
|
||||||
tss = append(tss, alertToTimeSeries(a, timestamp))
|
alertToTimeSeries(a, timestamp),
|
||||||
if ar.For > 0 {
|
alertForToTimeSeries(a, timestamp),
|
||||||
tss = append(tss, alertForToTimeSeries(a, timestamp))
|
|
||||||
}
|
}
|
||||||
return tss
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
||||||
|
@ -28,20 +28,26 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
newTestAlertingRule("instant", 0),
|
newTestAlertingRule("instant", 0),
|
||||||
¬ifier.Alert{State: notifier.StateFiring},
|
¬ifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second)},
|
||||||
[]prompbmarshal.TimeSeries{
|
[]prompbmarshal.TimeSeries{
|
||||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||||
"__name__": alertMetricName,
|
"__name__": alertMetricName,
|
||||||
alertStateLabel: notifier.StateFiring.String(),
|
alertStateLabel: notifier.StateFiring.String(),
|
||||||
}),
|
}),
|
||||||
|
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
|
||||||
|
[]int64{timestamp.UnixNano()},
|
||||||
|
map[string]string{
|
||||||
|
"__name__": alertForStateMetricName,
|
||||||
|
}),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
newTestAlertingRule("instant extra labels", 0),
|
newTestAlertingRule("instant extra labels", 0),
|
||||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
¬ifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
|
||||||
"job": "foo",
|
Labels: map[string]string{
|
||||||
"instance": "bar",
|
"job": "foo",
|
||||||
}},
|
"instance": "bar",
|
||||||
|
}},
|
||||||
[]prompbmarshal.TimeSeries{
|
[]prompbmarshal.TimeSeries{
|
||||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||||
"__name__": alertMetricName,
|
"__name__": alertMetricName,
|
||||||
@ -49,19 +55,33 @@ func TestAlertingRule_ToTimeSeries(t *testing.T) {
|
|||||||
"job": "foo",
|
"job": "foo",
|
||||||
"instance": "bar",
|
"instance": "bar",
|
||||||
}),
|
}),
|
||||||
|
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
|
||||||
|
[]int64{timestamp.UnixNano()},
|
||||||
|
map[string]string{
|
||||||
|
"__name__": alertForStateMetricName,
|
||||||
|
"job": "foo",
|
||||||
|
"instance": "bar",
|
||||||
|
}),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
newTestAlertingRule("instant labels override", 0),
|
newTestAlertingRule("instant labels override", 0),
|
||||||
¬ifier.Alert{State: notifier.StateFiring, Labels: map[string]string{
|
¬ifier.Alert{State: notifier.StateFiring, ActiveAt: timestamp.Add(time.Second),
|
||||||
alertStateLabel: "foo",
|
Labels: map[string]string{
|
||||||
"__name__": "bar",
|
alertStateLabel: "foo",
|
||||||
}},
|
"__name__": "bar",
|
||||||
|
}},
|
||||||
[]prompbmarshal.TimeSeries{
|
[]prompbmarshal.TimeSeries{
|
||||||
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
newTimeSeries([]float64{1}, []int64{timestamp.UnixNano()}, map[string]string{
|
||||||
"__name__": alertMetricName,
|
"__name__": alertMetricName,
|
||||||
alertStateLabel: notifier.StateFiring.String(),
|
alertStateLabel: notifier.StateFiring.String(),
|
||||||
}),
|
}),
|
||||||
|
newTimeSeries([]float64{float64(timestamp.Add(time.Second).Unix())},
|
||||||
|
[]int64{timestamp.UnixNano()},
|
||||||
|
map[string]string{
|
||||||
|
"__name__": alertForStateMetricName,
|
||||||
|
alertStateLabel: "foo",
|
||||||
|
}),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -367,7 +387,7 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||||||
{Values: []float64{1}, Timestamps: []int64{1}},
|
{Values: []float64{1}, Timestamps: []int64{1}},
|
||||||
},
|
},
|
||||||
[]*notifier.Alert{
|
[]*notifier.Alert{
|
||||||
{State: notifier.StateFiring},
|
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||||
},
|
},
|
||||||
nil,
|
nil,
|
||||||
},
|
},
|
||||||
@ -378,8 +398,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||||||
},
|
},
|
||||||
[]*notifier.Alert{
|
[]*notifier.Alert{
|
||||||
{
|
{
|
||||||
Labels: map[string]string{"name": "foo"},
|
Labels: map[string]string{"name": "foo"},
|
||||||
State: notifier.StateFiring,
|
State: notifier.StateFiring,
|
||||||
|
ActiveAt: time.Unix(1, 0),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
nil,
|
nil,
|
||||||
@ -390,9 +411,9 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||||||
{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
|
{Values: []float64{1, 1, 1}, Timestamps: []int64{1e3, 2e3, 3e3}},
|
||||||
},
|
},
|
||||||
[]*notifier.Alert{
|
[]*notifier.Alert{
|
||||||
{State: notifier.StateFiring},
|
{State: notifier.StateFiring, ActiveAt: time.Unix(1e3, 0)},
|
||||||
{State: notifier.StateFiring},
|
{State: notifier.StateFiring, ActiveAt: time.Unix(2e3, 0)},
|
||||||
{State: notifier.StateFiring},
|
{State: notifier.StateFiring, ActiveAt: time.Unix(3e3, 0)},
|
||||||
},
|
},
|
||||||
nil,
|
nil,
|
||||||
},
|
},
|
||||||
@ -460,6 +481,20 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||||||
For: time.Second,
|
For: time.Second,
|
||||||
}},
|
}},
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
newTestAlertingRuleWithEvalInterval("firing=>inactive=>inactive=>firing=>firing", 0, time.Second),
|
||||||
|
[]datasource.Metric{
|
||||||
|
{Values: []float64{1, 1, 1, 1}, Timestamps: []int64{1, 4, 5, 6}},
|
||||||
|
},
|
||||||
|
[]*notifier.Alert{
|
||||||
|
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0)},
|
||||||
|
// It is expected for ActiveAT to remain the same while rule continues to fire in each iteration
|
||||||
|
{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
|
||||||
|
{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
|
||||||
|
{State: notifier.StateFiring, ActiveAt: time.Unix(4, 0)},
|
||||||
|
},
|
||||||
|
nil,
|
||||||
|
},
|
||||||
{
|
{
|
||||||
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
|
newTestAlertingRule("for=>pending=>firing=>pending=>firing=>pending", time.Second),
|
||||||
[]datasource.Metric{
|
[]datasource.Metric{
|
||||||
@ -534,21 +569,25 @@ func TestAlertingRule_ExecRange(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
[]*notifier.Alert{
|
[]*notifier.Alert{
|
||||||
{State: notifier.StateFiring, Labels: map[string]string{
|
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
|
||||||
"source": "vm",
|
Labels: map[string]string{
|
||||||
}},
|
"source": "vm",
|
||||||
{State: notifier.StateFiring, Labels: map[string]string{
|
}},
|
||||||
"source": "vm",
|
{State: notifier.StateFiring, ActiveAt: time.Unix(100, 0),
|
||||||
}},
|
Labels: map[string]string{
|
||||||
|
"source": "vm",
|
||||||
|
}},
|
||||||
//
|
//
|
||||||
{State: notifier.StateFiring, Labels: map[string]string{
|
{State: notifier.StateFiring, ActiveAt: time.Unix(1, 0),
|
||||||
"foo": "bar",
|
Labels: map[string]string{
|
||||||
"source": "vm",
|
"foo": "bar",
|
||||||
}},
|
"source": "vm",
|
||||||
{State: notifier.StateFiring, Labels: map[string]string{
|
}},
|
||||||
"foo": "bar",
|
{State: notifier.StateFiring, ActiveAt: time.Unix(5, 0),
|
||||||
"source": "vm",
|
Labels: map[string]string{
|
||||||
}},
|
"foo": "bar",
|
||||||
|
"source": "vm",
|
||||||
|
}},
|
||||||
},
|
},
|
||||||
nil,
|
nil,
|
||||||
},
|
},
|
||||||
@ -1095,6 +1134,12 @@ func newTestAlertingRule(name string, waitFor time.Duration) *AlertingRule {
|
|||||||
return &rule
|
return &rule
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func newTestAlertingRuleWithEvalInterval(name string, waitFor, evalInterval time.Duration) *AlertingRule {
|
||||||
|
rule := newTestAlertingRule(name, waitFor)
|
||||||
|
rule.EvalInterval = evalInterval
|
||||||
|
return rule
|
||||||
|
}
|
||||||
|
|
||||||
func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
|
func newTestAlertingRuleWithKeepFiring(name string, waitFor, keepFiringFor time.Duration) *AlertingRule {
|
||||||
rule := newTestAlertingRule(name, waitFor)
|
rule := newTestAlertingRule(name, waitFor)
|
||||||
rule.KeepFiringFor = keepFiringFor
|
rule.KeepFiringFor = keepFiringFor
|
||||||
|
@ -70,6 +70,7 @@ The sandbox cluster installation is running under the constant load generated by
|
|||||||
* BUGFIX: `vmstorage`: properly expire `storage/prefetchedMetricIDs` cache. Previously this cache was never expired, so it could grow big under [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). This could result in increasing CPU load over time.
|
* BUGFIX: `vmstorage`: properly expire `storage/prefetchedMetricIDs` cache. Previously this cache was never expired, so it could grow big under [high churn rate](https://docs.victoriametrics.com/FAQ.html#what-is-high-churn-rate). This could result in increasing CPU load over time.
|
||||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
|
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.
|
||||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
|
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add `exported_` prefix for original evaluation result label if it's conflicted with external or reserved one, previously it was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
|
||||||
|
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0`. Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. The change aligns with Prometheus behavior. See more details in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648).
|
||||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
|
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
|
||||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return results from [bottomk](https://docs.victoriametrics.com/MetricsQL.html#bottomk) and `bottomk_*()` functions when some of these results contain NaN values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5506). Thanks to @xiaozongyang for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5509).
|
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return results from [bottomk](https://docs.victoriametrics.com/MetricsQL.html#bottomk) and `bottomk_*()` functions when some of these results contain NaN values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5506). Thanks to @xiaozongyang for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5509).
|
||||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
|
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` was resulting to `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
|
||||||
|
Loading…
Reference in New Issue
Block a user