2023-10-13 13:54:33 +02:00
|
|
|
package rule
|
2020-06-01 12:46:37 +02:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"hash/fnv"
|
|
|
|
"sort"
|
2022-09-13 15:25:43 +02:00
|
|
|
"strings"
|
2020-06-01 12:46:37 +02:00
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
2022-05-14 11:38:44 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/templates"
|
2022-02-02 13:11:41 +01:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
2020-06-01 12:46:37 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
|
|
|
)
|
|
|
|
|
|
|
|
// AlertingRule is basic alert entity
|
|
|
|
type AlertingRule struct {
|
2023-07-27 15:13:13 +02:00
|
|
|
Type config.Type
|
|
|
|
RuleID uint64
|
|
|
|
Name string
|
|
|
|
Expr string
|
|
|
|
For time.Duration
|
|
|
|
KeepFiringFor time.Duration
|
|
|
|
Labels map[string]string
|
|
|
|
Annotations map[string]string
|
|
|
|
GroupID uint64
|
|
|
|
GroupName string
|
|
|
|
EvalInterval time.Duration
|
|
|
|
Debug bool
|
2020-06-01 12:46:37 +02:00
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
q datasource.Querier
|
|
|
|
|
2022-09-14 14:04:24 +02:00
|
|
|
alertsMu sync.RWMutex
|
2020-06-01 12:46:37 +02:00
|
|
|
// stores list of active alerts
|
|
|
|
alerts map[uint64]*notifier.Alert
|
2022-09-14 14:04:24 +02:00
|
|
|
|
|
|
|
// state stores recent state changes
|
|
|
|
// during evaluations
|
|
|
|
state *ruleState
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
|
|
|
metrics *alertingRuleMetrics
|
|
|
|
}
|
|
|
|
|
|
|
|
type alertingRuleMetrics struct {
|
2023-05-08 09:36:39 +02:00
|
|
|
errors *utils.Gauge
|
|
|
|
pending *utils.Gauge
|
|
|
|
active *utils.Gauge
|
|
|
|
samples *utils.Gauge
|
|
|
|
seriesFetched *utils.Gauge
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// NewAlertingRule creates a new AlertingRule
|
|
|
|
func NewAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
ar := &AlertingRule{
|
2023-07-27 15:13:13 +02:00
|
|
|
Type: group.Type,
|
|
|
|
RuleID: cfg.ID,
|
|
|
|
Name: cfg.Alert,
|
|
|
|
Expr: cfg.Expr,
|
|
|
|
For: cfg.For.Duration(),
|
|
|
|
KeepFiringFor: cfg.KeepFiringFor.Duration(),
|
|
|
|
Labels: cfg.Labels,
|
|
|
|
Annotations: cfg.Annotations,
|
|
|
|
GroupID: group.ID(),
|
|
|
|
GroupName: group.Name,
|
|
|
|
EvalInterval: group.Interval,
|
|
|
|
Debug: cfg.Debug,
|
2021-04-30 08:46:03 +02:00
|
|
|
q: qb.BuildWithParams(datasource.QuerierParams{
|
2022-07-22 10:44:55 +02:00
|
|
|
DataSourceType: group.Type.String(),
|
2021-04-30 08:46:03 +02:00
|
|
|
EvaluationInterval: group.Interval,
|
2021-12-02 13:45:08 +01:00
|
|
|
QueryParams: group.Params,
|
2022-07-21 15:59:55 +02:00
|
|
|
Headers: group.Headers,
|
2022-09-13 15:25:43 +02:00
|
|
|
Debug: cfg.Debug,
|
2021-04-30 08:46:03 +02:00
|
|
|
}),
|
|
|
|
alerts: make(map[uint64]*notifier.Alert),
|
|
|
|
metrics: &alertingRuleMetrics{},
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
entrySize := *ruleUpdateEntriesLimit
|
2022-12-29 12:36:44 +01:00
|
|
|
if cfg.UpdateEntriesLimit != nil {
|
2023-10-13 13:54:33 +02:00
|
|
|
entrySize = *cfg.UpdateEntriesLimit
|
|
|
|
}
|
|
|
|
if entrySize < 1 {
|
|
|
|
entrySize = 1
|
|
|
|
}
|
|
|
|
ar.state = &ruleState{
|
|
|
|
entries: make([]StateEntry, entrySize),
|
2022-12-29 12:36:44 +01:00
|
|
|
}
|
|
|
|
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
2022-02-02 13:11:41 +01:00
|
|
|
ar.metrics.pending = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
func() float64 {
|
2022-09-14 14:04:24 +02:00
|
|
|
ar.alertsMu.RLock()
|
|
|
|
defer ar.alertsMu.RUnlock()
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
var num int
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.State == notifier.StatePending {
|
|
|
|
num++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return float64(num)
|
|
|
|
})
|
2022-02-02 13:11:41 +01:00
|
|
|
ar.metrics.active = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
func() float64 {
|
2022-09-14 14:04:24 +02:00
|
|
|
ar.alertsMu.RLock()
|
|
|
|
defer ar.alertsMu.RUnlock()
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
var num int
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.State == notifier.StateFiring {
|
|
|
|
num++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return float64(num)
|
|
|
|
})
|
2022-02-02 13:11:41 +01:00
|
|
|
ar.metrics.errors = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
func() float64 {
|
2022-09-14 14:04:24 +02:00
|
|
|
e := ar.state.getLast()
|
2023-10-13 13:54:33 +02:00
|
|
|
if e.Err == nil {
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
return 0
|
|
|
|
}
|
|
|
|
return 1
|
|
|
|
})
|
2022-02-02 13:11:41 +01:00
|
|
|
ar.metrics.samples = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
2021-08-05 08:59:46 +02:00
|
|
|
func() float64 {
|
2022-09-14 14:04:24 +02:00
|
|
|
e := ar.state.getLast()
|
2023-10-13 13:54:33 +02:00
|
|
|
return float64(e.Samples)
|
2021-08-05 08:59:46 +02:00
|
|
|
})
|
2023-05-08 09:36:39 +02:00
|
|
|
ar.metrics.seriesFetched = utils.GetOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_series_fetched{%s}`, labels),
|
|
|
|
func() float64 {
|
|
|
|
e := ar.state.getLast()
|
2023-10-13 13:54:33 +02:00
|
|
|
if e.SeriesFetched == nil {
|
2023-05-08 09:36:39 +02:00
|
|
|
// means seriesFetched is unsupported
|
|
|
|
return -1
|
|
|
|
}
|
2023-10-13 13:54:33 +02:00
|
|
|
seriesFetched := float64(*e.SeriesFetched)
|
|
|
|
if seriesFetched == 0 && e.Samples > 0 {
|
2023-05-10 15:04:05 +02:00
|
|
|
// `alert: 0.95` will fetch no series
|
|
|
|
// but will get one time series in response.
|
2023-10-13 13:54:33 +02:00
|
|
|
seriesFetched = float64(e.Samples)
|
2023-05-10 15:04:05 +02:00
|
|
|
}
|
|
|
|
return seriesFetched
|
2023-05-08 09:36:39 +02:00
|
|
|
})
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
return ar
|
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// close unregisters rule metrics
|
|
|
|
func (ar *AlertingRule) close() {
|
2022-02-02 13:11:41 +01:00
|
|
|
ar.metrics.active.Unregister()
|
|
|
|
ar.metrics.pending.Unregister()
|
|
|
|
ar.metrics.errors.Unregister()
|
|
|
|
ar.metrics.samples.Unregister()
|
2023-05-08 09:36:39 +02:00
|
|
|
ar.metrics.seriesFetched.Unregister()
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// String implements Stringer interface
|
|
|
|
func (ar *AlertingRule) String() string {
|
|
|
|
return ar.Name
|
|
|
|
}
|
|
|
|
|
|
|
|
// ID returns unique Rule ID
|
|
|
|
// within the parent Group.
|
|
|
|
func (ar *AlertingRule) ID() uint64 {
|
2020-06-15 21:15:47 +02:00
|
|
|
return ar.RuleID
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// GetAlerts returns active alerts of rule
|
|
|
|
func (ar *AlertingRule) GetAlerts() []*notifier.Alert {
|
|
|
|
ar.alertsMu.RLock()
|
|
|
|
defer ar.alertsMu.RUnlock()
|
|
|
|
var alerts []*notifier.Alert
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
alerts = append(alerts, a)
|
|
|
|
}
|
|
|
|
return alerts
|
|
|
|
}
|
|
|
|
|
|
|
|
// GetAlert returns alert if id exists
|
|
|
|
func (ar *AlertingRule) GetAlert(id uint64) *notifier.Alert {
|
|
|
|
ar.alertsMu.RLock()
|
|
|
|
defer ar.alertsMu.RUnlock()
|
|
|
|
if ar.alerts == nil {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return ar.alerts[id]
|
|
|
|
}
|
|
|
|
|
2022-09-13 15:33:00 +02:00
|
|
|
func (ar *AlertingRule) logDebugf(at time.Time, a *notifier.Alert, format string, args ...interface{}) {
|
2022-09-13 15:25:43 +02:00
|
|
|
if !ar.Debug {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
prefix := fmt.Sprintf("DEBUG rule %q:%q (%d) at %v: ",
|
|
|
|
ar.GroupName, ar.Name, ar.RuleID, at.Format(time.RFC3339))
|
|
|
|
|
|
|
|
if a != nil {
|
|
|
|
labelKeys := make([]string, len(a.Labels))
|
|
|
|
var i int
|
|
|
|
for k := range a.Labels {
|
|
|
|
labelKeys[i] = k
|
|
|
|
i++
|
|
|
|
}
|
|
|
|
sort.Strings(labelKeys)
|
2022-09-13 15:33:00 +02:00
|
|
|
labels := make([]string, len(labelKeys))
|
|
|
|
for i, l := range labelKeys {
|
|
|
|
labels[i] = fmt.Sprintf("%s=%q", l, a.Labels[l])
|
2022-09-13 15:25:43 +02:00
|
|
|
}
|
2022-09-13 15:33:00 +02:00
|
|
|
labelsStr := strings.Join(labels, ",")
|
|
|
|
prefix += fmt.Sprintf("alert %d {%s} ", a.ID, labelsStr)
|
2022-09-13 15:25:43 +02:00
|
|
|
}
|
2022-09-13 15:33:00 +02:00
|
|
|
msg := fmt.Sprintf(format, args...)
|
2022-09-13 15:25:43 +02:00
|
|
|
logger.Infof("%s", prefix+msg)
|
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// updateWith copies all significant fields.
|
|
|
|
// alerts state isn't copied since
|
|
|
|
// it should be updated in next 2 Execs
|
|
|
|
func (ar *AlertingRule) updateWith(r Rule) error {
|
|
|
|
nr, ok := r.(*AlertingRule)
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
|
|
|
}
|
|
|
|
ar.Expr = nr.Expr
|
|
|
|
ar.For = nr.For
|
|
|
|
ar.KeepFiringFor = nr.KeepFiringFor
|
|
|
|
ar.Labels = nr.Labels
|
|
|
|
ar.Annotations = nr.Annotations
|
|
|
|
ar.EvalInterval = nr.EvalInterval
|
|
|
|
ar.Debug = nr.Debug
|
|
|
|
ar.q = nr.q
|
|
|
|
ar.state = nr.state
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
// labelSet holds two views of the labels of one alert.
type labelSet struct {
	// origin contains labels extracted from the received time series
	// plus extra labels (group labels, service labels like alertNameLabel).
	// On conflict, the label from the time series wins.
	// It is used for templating annotations.
	origin map[string]string

	// processed contains the origin labels
	// plus extra labels (group labels, service labels like alertNameLabel).
	// On conflict, the extra label wins.
	// It is used as the labels attached to notifier.Alert
	// and to ALERTS series written to remote storage.
	processed map[string]string
}
|
|
|
|
|
|
|
|
// toLabels converts labels from given Metric
|
|
|
|
// to labelSet which contains original and processed labels.
|
2022-05-14 11:38:44 +02:00
|
|
|
func (ar *AlertingRule) toLabels(m datasource.Metric, qFn templates.QueryFn) (*labelSet, error) {
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
ls := &labelSet{
|
2022-09-29 18:22:50 +02:00
|
|
|
origin: make(map[string]string),
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
processed: make(map[string]string),
|
|
|
|
}
|
|
|
|
for _, l := range m.Labels {
|
2022-06-27 09:57:56 +02:00
|
|
|
ls.origin[l.Name] = l.Value
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
// drop __name__ to be consistent with Prometheus alerting
|
|
|
|
if l.Name == "__name__" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ls.processed[l.Name] = l.Value
|
|
|
|
}
|
|
|
|
|
|
|
|
extraLabels, err := notifier.ExecTemplate(qFn, ar.Labels, notifier.AlertTplData{
|
|
|
|
Labels: ls.origin,
|
|
|
|
Value: m.Values[0],
|
|
|
|
Expr: ar.Expr,
|
|
|
|
})
|
|
|
|
if err != nil {
|
2023-10-25 21:24:01 +02:00
|
|
|
return nil, fmt.Errorf("failed to expand labels: %w", err)
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
|
|
|
for k, v := range extraLabels {
|
|
|
|
ls.processed[k] = v
|
2022-09-29 18:22:50 +02:00
|
|
|
if _, ok := ls.origin[k]; !ok {
|
|
|
|
ls.origin[k] = v
|
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// set additional labels to identify group and rule name
|
|
|
|
if ar.Name != "" {
|
|
|
|
ls.processed[alertNameLabel] = ar.Name
|
2022-09-29 18:22:50 +02:00
|
|
|
if _, ok := ls.origin[alertNameLabel]; !ok {
|
|
|
|
ls.origin[alertNameLabel] = ar.Name
|
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
|
|
|
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
|
|
|
ls.processed[alertGroupNameLabel] = ar.GroupName
|
2022-09-29 18:22:50 +02:00
|
|
|
if _, ok := ls.origin[alertGroupNameLabel]; !ok {
|
|
|
|
ls.origin[alertGroupNameLabel] = ar.GroupName
|
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
|
|
|
return ls, nil
|
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// execRange executes alerting rule on the given time range similarly to exec.
|
2021-06-09 11:20:38 +02:00
|
|
|
// It doesn't update internal states of the Rule and meant to be used just
|
|
|
|
// to get time series for backfilling.
|
|
|
|
// It returns ALERT and ALERT_FOR_STATE time series as result.
|
2023-10-13 13:54:33 +02:00
|
|
|
func (ar *AlertingRule) execRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
2023-05-08 09:36:39 +02:00
|
|
|
res, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
|
2021-06-09 11:20:38 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
var result []prompbmarshal.TimeSeries
|
|
|
|
qFn := func(query string) ([]datasource.Metric, error) {
|
|
|
|
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
|
|
|
|
}
|
2023-05-08 09:36:39 +02:00
|
|
|
for _, s := range res.Data {
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
a, err := ar.newAlert(s, nil, time.Time{}, qFn) // initial alert
|
2021-06-09 11:20:38 +02:00
|
|
|
if err != nil {
|
2023-10-25 21:24:01 +02:00
|
|
|
return nil, fmt.Errorf("failed to create alert: %w", err)
|
2021-06-09 11:20:38 +02:00
|
|
|
}
|
|
|
|
if ar.For == 0 { // if alert is instant
|
|
|
|
a.State = notifier.StateFiring
|
|
|
|
for i := range s.Values {
|
2022-06-09 08:58:25 +02:00
|
|
|
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
2021-06-09 11:20:38 +02:00
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
2022-06-09 08:58:25 +02:00
|
|
|
|
2021-06-09 11:20:38 +02:00
|
|
|
// if alert with For > 0
|
|
|
|
prevT := time.Time{}
|
|
|
|
for i := range s.Values {
|
|
|
|
at := time.Unix(s.Timestamps[i], 0)
|
|
|
|
if at.Sub(prevT) > ar.EvalInterval {
|
|
|
|
// reset to Pending if there are gaps > EvalInterval between DPs
|
|
|
|
a.State = notifier.StatePending
|
2022-03-29 15:09:07 +02:00
|
|
|
a.ActiveAt = at
|
|
|
|
} else if at.Sub(a.ActiveAt) >= ar.For {
|
2021-06-09 11:20:38 +02:00
|
|
|
a.State = notifier.StateFiring
|
2022-03-29 15:09:07 +02:00
|
|
|
a.Start = at
|
2021-06-09 11:20:38 +02:00
|
|
|
}
|
|
|
|
prevT = at
|
2022-06-09 08:58:25 +02:00
|
|
|
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
2021-06-09 11:20:38 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return result, nil
|
|
|
|
}
|
|
|
|
|
2022-03-29 15:09:07 +02:00
|
|
|
// resolvedRetention defines for how long a resolved alert instance stays
// in the in-memory state, and consequently keeps being sent to the AlertManager.
const resolvedRetention time.Duration = 15 * time.Minute
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// exec executes AlertingRule expression via the given Querier.
|
2020-06-01 12:46:37 +02:00
|
|
|
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
2023-10-13 13:54:33 +02:00
|
|
|
func (ar *AlertingRule) exec(ctx context.Context, ts time.Time, limit int) ([]prompbmarshal.TimeSeries, error) {
|
2022-03-15 12:54:53 +01:00
|
|
|
start := time.Now()
|
2023-05-08 09:36:39 +02:00
|
|
|
res, req, err := ar.q.Query(ctx, ar.Expr, ts)
|
2023-10-13 13:54:33 +02:00
|
|
|
curState := StateEntry{
|
|
|
|
Time: start,
|
|
|
|
At: ts,
|
|
|
|
Duration: time.Since(start),
|
|
|
|
Samples: len(res.Data),
|
|
|
|
SeriesFetched: res.SeriesFetched,
|
|
|
|
Err: err,
|
|
|
|
Curl: requestToCurl(req),
|
2022-09-14 14:04:24 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
defer func() {
|
|
|
|
ar.state.add(curState)
|
|
|
|
}()
|
|
|
|
|
|
|
|
ar.alertsMu.Lock()
|
|
|
|
defer ar.alertsMu.Unlock()
|
2020-06-01 12:46:37 +02:00
|
|
|
|
|
|
|
if err != nil {
|
2020-06-30 21:58:18 +02:00
|
|
|
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
ar.logDebugf(ts, nil, "query returned %d samples (elapsed: %s)", curState.Samples, curState.Duration)
|
2022-09-13 15:25:43 +02:00
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
for h, a := range ar.alerts {
|
|
|
|
// cleanup inactive alerts from previous Exec
|
2022-03-29 15:09:07 +02:00
|
|
|
if a.State == notifier.StateInactive && ts.Sub(a.ResolvedAt) > resolvedRetention {
|
2022-09-13 15:33:00 +02:00
|
|
|
ar.logDebugf(ts, a, "deleted as inactive")
|
2020-06-01 12:46:37 +02:00
|
|
|
delete(ar.alerts, h)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-15 12:40:22 +02:00
|
|
|
qFn := func(query string) ([]datasource.Metric, error) {
|
|
|
|
res, _, err := ar.q.Query(ctx, query, ts)
|
2023-05-08 09:36:39 +02:00
|
|
|
return res.Data, err
|
2022-09-15 12:40:22 +02:00
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
updated := make(map[uint64]struct{})
|
|
|
|
// update list of active alerts
|
2023-05-08 09:36:39 +02:00
|
|
|
for _, m := range res.Data {
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
ls, err := ar.toLabels(m, qFn)
|
2020-12-19 13:10:59 +01:00
|
|
|
if err != nil {
|
2023-10-25 21:24:01 +02:00
|
|
|
curState.Err = fmt.Errorf("failed to expand labels: %w", err)
|
2023-10-13 13:54:33 +02:00
|
|
|
return nil, curState.Err
|
2020-12-19 13:10:59 +01:00
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
h := hash(ls.processed)
|
2020-11-09 23:27:32 +01:00
|
|
|
if _, ok := updated[h]; ok {
|
|
|
|
// duplicate may be caused by extra labels
|
|
|
|
// conflicting with the metric labels
|
2023-10-13 13:54:33 +02:00
|
|
|
curState.Err = fmt.Errorf("labels %v: %w", ls.processed, errDuplicate)
|
|
|
|
return nil, curState.Err
|
2020-11-09 23:27:32 +01:00
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
updated[h] = struct{}{}
|
|
|
|
if a, ok := ar.alerts[h]; ok {
|
2022-03-29 15:09:07 +02:00
|
|
|
if a.State == notifier.StateInactive {
|
|
|
|
// alert could be in inactive state for resolvedRetention
|
|
|
|
// so when we again receive metrics for it - we switch it
|
|
|
|
// back to notifier.StatePending
|
|
|
|
a.State = notifier.StatePending
|
|
|
|
a.ActiveAt = ts
|
2022-09-13 15:33:00 +02:00
|
|
|
ar.logDebugf(ts, a, "INACTIVE => PENDING")
|
2022-03-29 15:09:07 +02:00
|
|
|
}
|
2022-09-16 16:19:10 +02:00
|
|
|
a.Value = m.Values[0]
|
|
|
|
// re-exec template since Value or query can be used in annotations
|
|
|
|
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
2023-07-27 15:13:13 +02:00
|
|
|
a.KeepFiringSince = time.Time{}
|
2020-06-01 12:46:37 +02:00
|
|
|
continue
|
|
|
|
}
|
2022-09-14 14:04:24 +02:00
|
|
|
a, err := ar.newAlert(m, ls, start, qFn)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
2023-10-13 13:54:33 +02:00
|
|
|
curState.Err = fmt.Errorf("failed to create alert: %w", err)
|
|
|
|
return nil, curState.Err
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
a.ID = h
|
|
|
|
a.State = notifier.StatePending
|
2022-03-29 15:09:07 +02:00
|
|
|
a.ActiveAt = ts
|
2020-06-01 12:46:37 +02:00
|
|
|
ar.alerts[h] = a
|
2022-09-13 15:33:00 +02:00
|
|
|
ar.logDebugf(ts, a, "created in state PENDING")
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
2022-06-09 08:21:30 +02:00
|
|
|
var numActivePending int
|
2020-06-01 12:46:37 +02:00
|
|
|
for h, a := range ar.alerts {
|
|
|
|
// if alert wasn't updated in this iteration
|
|
|
|
// means it is resolved already
|
|
|
|
if _, ok := updated[h]; !ok {
|
|
|
|
if a.State == notifier.StatePending {
|
|
|
|
// alert was in Pending state - it is not
|
|
|
|
// active anymore
|
|
|
|
delete(ar.alerts, h)
|
2022-09-13 15:33:00 +02:00
|
|
|
ar.logDebugf(ts, a, "PENDING => DELETED: is absent in current evaluation round")
|
2020-06-01 12:46:37 +02:00
|
|
|
continue
|
|
|
|
}
|
2023-07-27 15:13:13 +02:00
|
|
|
// check if alert should keep StateFiring if rule has
|
|
|
|
// `keep_firing_for` field
|
2022-03-29 15:09:07 +02:00
|
|
|
if a.State == notifier.StateFiring {
|
2023-07-27 15:13:13 +02:00
|
|
|
if ar.KeepFiringFor > 0 {
|
|
|
|
if a.KeepFiringSince.IsZero() {
|
|
|
|
a.KeepFiringSince = ts
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// alerts with ar.KeepFiringFor>0 may remain FIRING
|
|
|
|
// even if their expression isn't true anymore
|
|
|
|
if ts.Sub(a.KeepFiringSince) > ar.KeepFiringFor {
|
|
|
|
a.State = notifier.StateInactive
|
|
|
|
a.ResolvedAt = ts
|
|
|
|
ar.logDebugf(ts, a, "FIRING => INACTIVE: is absent in current evaluation round")
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ar.logDebugf(ts, a, "KEEP_FIRING: will keep firing for %fs since %v", ar.KeepFiringFor.Seconds(), a.KeepFiringSince)
|
2022-03-29 15:09:07 +02:00
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
2022-06-09 08:21:30 +02:00
|
|
|
numActivePending++
|
2022-05-09 10:11:06 +02:00
|
|
|
if a.State == notifier.StatePending && ts.Sub(a.ActiveAt) >= ar.For {
|
2020-06-01 12:46:37 +02:00
|
|
|
a.State = notifier.StateFiring
|
2022-03-29 15:09:07 +02:00
|
|
|
a.Start = ts
|
2020-06-01 12:46:37 +02:00
|
|
|
alertsFired.Inc()
|
2022-09-13 15:33:00 +02:00
|
|
|
ar.logDebugf(ts, a, "PENDING => FIRING: %s since becoming active at %v", ts.Sub(a.ActiveAt), a.ActiveAt)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
}
|
2022-06-09 08:21:30 +02:00
|
|
|
if limit > 0 && numActivePending > limit {
|
|
|
|
ar.alerts = map[uint64]*notifier.Alert{}
|
2023-10-13 13:54:33 +02:00
|
|
|
curState.Err = fmt.Errorf("exec exceeded limit of %d with %d alerts", limit, numActivePending)
|
|
|
|
return nil, curState.Err
|
2022-06-09 08:21:30 +02:00
|
|
|
}
|
2022-03-29 15:09:07 +02:00
|
|
|
return ar.toTimeSeries(ts.Unix()), nil
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2021-06-09 11:20:38 +02:00
|
|
|
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
var tss []prompbmarshal.TimeSeries
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.State == notifier.StateInactive {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ts := ar.alertToTimeSeries(a, timestamp)
|
|
|
|
tss = append(tss, ts...)
|
|
|
|
}
|
|
|
|
return tss
|
|
|
|
}
|
|
|
|
|
|
|
|
// hash returns the FNV-1a 64-bit hash of the given labels.
// Labels are fed to the hasher in name-sorted order, so the result doesn't
// depend on map iteration order. Each label is written as name, value and
// a trailing 0xff byte.
//
// TODO: consider hashing algorithm in VM
func hash(labels map[string]string) uint64 {
	names := make([]string, 0, len(labels))
	for name := range labels {
		// drop __name__ to be consistent with Prometheus alerting
		if name != "__name__" {
			names = append(names, name)
		}
	}
	sort.Strings(names)

	h := fnv.New64a()
	for _, name := range names {
		h.Write([]byte(name))
		h.Write([]byte(labels[name]))
		h.Write([]byte("\xff"))
	}
	return h.Sum64()
}
|
|
|
|
|
2022-05-14 11:38:44 +02:00
|
|
|
func (ar *AlertingRule) newAlert(m datasource.Metric, ls *labelSet, start time.Time, qFn templates.QueryFn) (*notifier.Alert, error) {
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
var err error
|
|
|
|
if ls == nil {
|
|
|
|
ls, err = ar.toLabels(m, qFn)
|
|
|
|
if err != nil {
|
2023-10-25 21:24:01 +02:00
|
|
|
return nil, fmt.Errorf("failed to expand labels: %w", err)
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
a := ¬ifier.Alert{
|
2022-03-29 15:09:07 +02:00
|
|
|
GroupID: ar.GroupID,
|
|
|
|
Name: ar.Name,
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
Labels: ls.processed,
|
2022-03-29 15:09:07 +02:00
|
|
|
Value: m.Values[0],
|
|
|
|
ActiveAt: start,
|
|
|
|
Expr: ar.Expr,
|
2022-12-12 20:16:10 +01:00
|
|
|
For: ar.For,
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overridden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overridden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
a.Annotations, err = a.ExecTemplate(qFn, ls.origin, ar.Annotations)
|
2020-12-19 13:10:59 +01:00
|
|
|
return a, err
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Metric names of the synthetic time series generated for alerts.
const (
	// alertMetricName names the synthetic series emitted for an alert.
	alertMetricName = "ALERTS"
	// alertForStateMetricName names the series carrying the 'for' state of an alert.
	alertForStateMetricName = "ALERTS_FOR_STATE"
)

// Label names attached to the generated alert time series.
const (
	// alertNameLabel is the label holding the name of an alert.
	alertNameLabel = "alertname"
	// alertStateLabel is the label holding the state of an alert.
	alertStateLabel = "alertstate"
	// alertGroupNameLabel is the label name attached for generated time series.
	// Attaching this label may be disabled via `-disableAlertgroupLabel` flag.
	alertGroupNameLabel = "alertgroup"
)
|
|
|
|
|
2022-03-29 15:09:07 +02:00
|
|
|
// alertToTimeSeries converts the given alert with the given timestamp to time series
|
2021-06-09 11:20:38 +02:00
|
|
|
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
var tss []prompbmarshal.TimeSeries
|
2021-10-22 11:30:38 +02:00
|
|
|
tss = append(tss, alertToTimeSeries(a, timestamp))
|
2020-06-01 12:46:37 +02:00
|
|
|
if ar.For > 0 {
|
2021-10-22 11:30:38 +02:00
|
|
|
tss = append(tss, alertForToTimeSeries(a, timestamp))
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
return tss
|
|
|
|
}
|
|
|
|
|
2021-10-22 11:30:38 +02:00
|
|
|
func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
labels := make(map[string]string)
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
|
|
|
labels["__name__"] = alertMetricName
|
|
|
|
labels[alertStateLabel] = a.State.String()
|
2021-06-09 11:20:38 +02:00
|
|
|
return newTimeSeries([]float64{1}, []int64{timestamp}, labels)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// alertForToTimeSeries returns a timeseries that represents
|
|
|
|
// state of active alerts, where value is time when alert become active
|
2021-10-22 11:30:38 +02:00
|
|
|
func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
labels := make(map[string]string)
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
|
|
|
labels["__name__"] = alertForStateMetricName
|
2022-03-29 15:09:07 +02:00
|
|
|
return newTimeSeries([]float64{float64(a.ActiveAt.Unix())}, []int64{timestamp}, labels)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// restore restores the value of ActiveAt field for active alerts,
|
2023-02-04 04:46:13 +01:00
|
|
|
// based on previously written time series `alertForStateMetricName`.
|
|
|
|
// Only rules with For > 0 can be restored.
|
2023-10-13 13:54:33 +02:00
|
|
|
func (ar *AlertingRule) restore(ctx context.Context, q datasource.Querier, ts time.Time, lookback time.Duration) error {
|
2023-02-04 04:46:13 +01:00
|
|
|
if ar.For < 1 {
|
|
|
|
return nil
|
2022-09-15 12:40:22 +02:00
|
|
|
}
|
2020-12-14 19:11:45 +01:00
|
|
|
|
2023-02-04 04:46:13 +01:00
|
|
|
ar.alertsMu.Lock()
|
|
|
|
defer ar.alertsMu.Unlock()
|
2020-07-28 13:20:31 +02:00
|
|
|
|
2023-02-04 04:46:13 +01:00
|
|
|
if len(ar.alerts) < 1 {
|
|
|
|
return nil
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2023-02-04 04:46:13 +01:00
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.Restored || a.State != notifier.StatePending {
|
|
|
|
continue
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
2023-02-04 04:46:13 +01:00
|
|
|
|
|
|
|
var labelsFilter []string
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labelsFilter = append(labelsFilter, fmt.Sprintf("%s=%q", k, v))
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
}
|
2023-02-04 04:46:13 +01:00
|
|
|
sort.Strings(labelsFilter)
|
|
|
|
expr := fmt.Sprintf("last_over_time(%s{%s}[%ds])",
|
|
|
|
alertForStateMetricName, strings.Join(labelsFilter, ","), int(lookback.Seconds()))
|
|
|
|
|
|
|
|
ar.logDebugf(ts, nil, "restoring alert state via query %q", expr)
|
|
|
|
|
2023-05-08 09:36:39 +02:00
|
|
|
res, _, err := q.Query(ctx, expr, ts)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
2023-02-04 04:46:13 +01:00
|
|
|
return err
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
2023-02-04 04:46:13 +01:00
|
|
|
|
2023-05-08 09:36:39 +02:00
|
|
|
qMetrics := res.Data
|
2023-02-04 04:46:13 +01:00
|
|
|
if len(qMetrics) < 1 {
|
|
|
|
ar.logDebugf(ts, nil, "no response was received from restore query")
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// only one series expected in response
|
|
|
|
m := qMetrics[0]
|
|
|
|
// __name__ supposed to be alertForStateMetricName
|
|
|
|
m.DelLabel("__name__")
|
|
|
|
|
|
|
|
// we assume that restore query contains all label matchers,
|
|
|
|
// so all received labels will match anyway if their number is equal.
|
|
|
|
if len(m.Labels) != len(a.Labels) {
|
|
|
|
ar.logDebugf(ts, nil, "state restore query returned not expected label-set %v", m.Labels)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
a.ActiveAt = time.Unix(int64(m.Values[0]), 0)
|
2021-10-22 11:30:38 +02:00
|
|
|
a.Restored = true
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.ActiveAt)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
2022-03-16 16:26:33 +01:00
|
|
|
|
|
|
|
// alertsToSend walks through the current alerts of AlertingRule
|
|
|
|
// and returns only those which should be sent to notifier.
|
|
|
|
// Isn't concurrent safe.
|
|
|
|
func (ar *AlertingRule) alertsToSend(ts time.Time, resolveDuration, resendDelay time.Duration) []notifier.Alert {
|
2022-03-29 15:09:07 +02:00
|
|
|
needsSending := func(a *notifier.Alert) bool {
|
|
|
|
if a.State == notifier.StatePending {
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
if a.ResolvedAt.After(a.LastSent) {
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
return a.LastSent.Add(resendDelay).Before(ts)
|
|
|
|
}
|
|
|
|
|
2022-03-16 16:26:33 +01:00
|
|
|
var alerts []notifier.Alert
|
|
|
|
for _, a := range ar.alerts {
|
2022-03-29 15:09:07 +02:00
|
|
|
if !needsSending(a) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
a.End = ts.Add(resolveDuration)
|
|
|
|
if a.State == notifier.StateInactive {
|
|
|
|
a.End = a.ResolvedAt
|
2022-03-16 16:26:33 +01:00
|
|
|
}
|
2022-03-29 15:09:07 +02:00
|
|
|
a.LastSent = ts
|
|
|
|
alerts = append(alerts, *a)
|
2022-03-16 16:26:33 +01:00
|
|
|
}
|
|
|
|
return alerts
|
|
|
|
}
|