2020-06-01 12:46:37 +02:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"hash/fnv"
|
|
|
|
"sort"
|
|
|
|
"strconv"
|
|
|
|
"sync"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal"
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_active{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
"github.com/VictoriaMetrics/metrics"
|
2020-06-01 12:46:37 +02:00
|
|
|
)
|
|
|
|
|
|
|
|
// AlertingRule is basic alert entity
|
|
|
|
type AlertingRule struct {
|
2021-06-09 11:20:38 +02:00
|
|
|
Type datasource.Type
|
|
|
|
RuleID uint64
|
|
|
|
Name string
|
|
|
|
Expr string
|
|
|
|
For time.Duration
|
|
|
|
Labels map[string]string
|
|
|
|
Annotations map[string]string
|
|
|
|
GroupID uint64
|
|
|
|
GroupName string
|
|
|
|
EvalInterval time.Duration
|
2020-06-01 12:46:37 +02:00
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
q datasource.Querier
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
// guard status fields
|
|
|
|
mu sync.RWMutex
|
|
|
|
// stores list of active alerts
|
|
|
|
alerts map[uint64]*notifier.Alert
|
|
|
|
// stores last moment of time Exec was called
|
|
|
|
lastExecTime time.Time
|
|
|
|
// stores last error that happened in Exec func
|
|
|
|
// resets on every successful Exec
|
|
|
|
// may be used as Health state
|
|
|
|
lastExecError error
|
2021-08-05 08:59:46 +02:00
|
|
|
// stores the number of samples returned during
|
|
|
|
// the last evaluation
|
|
|
|
lastExecSamples int
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
|
|
|
metrics *alertingRuleMetrics
|
|
|
|
}
|
|
|
|
|
|
|
|
type alertingRuleMetrics struct {
|
|
|
|
errors *gauge
|
|
|
|
pending *gauge
|
|
|
|
active *gauge
|
2021-08-05 08:59:46 +02:00
|
|
|
samples *gauge
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
func newAlertingRule(qb datasource.QuerierBuilder, group *Group, cfg config.Rule) *AlertingRule {
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
ar := &AlertingRule{
|
2021-11-05 18:49:32 +01:00
|
|
|
Type: group.Type,
|
2021-06-09 11:20:38 +02:00
|
|
|
RuleID: cfg.ID,
|
|
|
|
Name: cfg.Alert,
|
|
|
|
Expr: cfg.Expr,
|
|
|
|
For: cfg.For.Duration(),
|
|
|
|
Labels: cfg.Labels,
|
|
|
|
Annotations: cfg.Annotations,
|
|
|
|
GroupID: group.ID(),
|
|
|
|
GroupName: group.Name,
|
|
|
|
EvalInterval: group.Interval,
|
2021-04-30 08:46:03 +02:00
|
|
|
q: qb.BuildWithParams(datasource.QuerierParams{
|
2021-11-05 18:49:32 +01:00
|
|
|
DataSourceType: &group.Type,
|
2021-04-30 08:46:03 +02:00
|
|
|
EvaluationInterval: group.Interval,
|
2021-12-02 13:45:08 +01:00
|
|
|
QueryParams: group.Params,
|
2021-04-30 08:46:03 +02:00
|
|
|
}),
|
|
|
|
alerts: make(map[uint64]*notifier.Alert),
|
|
|
|
metrics: &alertingRuleMetrics{},
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
|
|
|
labels := fmt.Sprintf(`alertname=%q, group=%q, id="%d"`, ar.Name, group.Name, ar.ID())
|
|
|
|
ar.metrics.pending = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_pending{%s}`, labels),
|
|
|
|
func() float64 {
|
2021-08-05 08:59:46 +02:00
|
|
|
ar.mu.RLock()
|
|
|
|
defer ar.mu.RUnlock()
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
var num int
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.State == notifier.StatePending {
|
|
|
|
num++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return float64(num)
|
|
|
|
})
|
|
|
|
ar.metrics.active = getOrCreateGauge(fmt.Sprintf(`vmalert_alerts_firing{%s}`, labels),
|
|
|
|
func() float64 {
|
2021-08-05 08:59:46 +02:00
|
|
|
ar.mu.RLock()
|
|
|
|
defer ar.mu.RUnlock()
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
var num int
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.State == notifier.StateFiring {
|
|
|
|
num++
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return float64(num)
|
|
|
|
})
|
2021-08-05 08:59:46 +02:00
|
|
|
ar.metrics.errors = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_error{%s}`, labels),
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
func() float64 {
|
2021-08-05 08:59:46 +02:00
|
|
|
ar.mu.RLock()
|
|
|
|
defer ar.mu.RUnlock()
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
if ar.lastExecError == nil {
|
|
|
|
return 0
|
|
|
|
}
|
|
|
|
return 1
|
|
|
|
})
|
2021-08-05 08:59:46 +02:00
|
|
|
ar.metrics.samples = getOrCreateGauge(fmt.Sprintf(`vmalert_alerting_rules_last_evaluation_samples{%s}`, labels),
|
|
|
|
func() float64 {
|
|
|
|
ar.mu.RLock()
|
|
|
|
defer ar.mu.RUnlock()
|
|
|
|
return float64(ar.lastExecSamples)
|
|
|
|
})
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
return ar
|
|
|
|
}
|
|
|
|
|
|
|
|
// Close unregisters rule metrics
|
|
|
|
func (ar *AlertingRule) Close() {
|
|
|
|
metrics.UnregisterMetric(ar.metrics.active.name)
|
|
|
|
metrics.UnregisterMetric(ar.metrics.pending.name)
|
|
|
|
metrics.UnregisterMetric(ar.metrics.errors.name)
|
2021-08-05 08:59:46 +02:00
|
|
|
metrics.UnregisterMetric(ar.metrics.samples.name)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// String implements Stringer interface
|
|
|
|
func (ar *AlertingRule) String() string {
|
|
|
|
return ar.Name
|
|
|
|
}
|
|
|
|
|
|
|
|
// ID returns unique Rule ID
|
|
|
|
// within the parent Group.
|
|
|
|
func (ar *AlertingRule) ID() uint64 {
|
2020-06-15 21:15:47 +02:00
|
|
|
return ar.RuleID
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2021-06-09 11:20:38 +02:00
|
|
|
// ExecRange executes alerting rule on the given time range similarly to Exec.
|
|
|
|
// It doesn't update internal states of the Rule and meant to be used just
|
|
|
|
// to get time series for backfilling.
|
|
|
|
// It returns ALERT and ALERT_FOR_STATE time series as result.
|
|
|
|
func (ar *AlertingRule) ExecRange(ctx context.Context, start, end time.Time) ([]prompbmarshal.TimeSeries, error) {
|
|
|
|
series, err := ar.q.QueryRange(ctx, ar.Expr, start, end)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
var result []prompbmarshal.TimeSeries
|
|
|
|
qFn := func(query string) ([]datasource.Metric, error) {
|
|
|
|
return nil, fmt.Errorf("`query` template isn't supported in replay mode")
|
|
|
|
}
|
|
|
|
for _, s := range series {
|
2021-12-10 11:10:26 +01:00
|
|
|
// set additional labels to identify group and rule name
|
|
|
|
if ar.Name != "" {
|
|
|
|
s.SetLabel(alertNameLabel, ar.Name)
|
|
|
|
}
|
|
|
|
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
|
|
|
s.SetLabel(alertGroupNameLabel, ar.GroupName)
|
|
|
|
}
|
2021-06-09 11:20:38 +02:00
|
|
|
// extra labels could contain templates, so we expand them first
|
|
|
|
labels, err := expandLabels(s, qFn, ar)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
|
|
|
}
|
|
|
|
for k, v := range labels {
|
|
|
|
// apply extra labels to datasource
|
|
|
|
// so the hash key will be consistent on restore
|
|
|
|
s.SetLabel(k, v)
|
|
|
|
}
|
|
|
|
a, err := ar.newAlert(s, time.Time{}, qFn) // initial alert
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create alert: %s", err)
|
|
|
|
}
|
|
|
|
if ar.For == 0 { // if alert is instant
|
|
|
|
a.State = notifier.StateFiring
|
|
|
|
for i := range s.Values {
|
|
|
|
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// if alert with For > 0
|
|
|
|
prevT := time.Time{}
|
|
|
|
for i := range s.Values {
|
|
|
|
at := time.Unix(s.Timestamps[i], 0)
|
|
|
|
if at.Sub(prevT) > ar.EvalInterval {
|
|
|
|
// reset to Pending if there are gaps > EvalInterval between DPs
|
|
|
|
a.State = notifier.StatePending
|
|
|
|
a.Start = at
|
|
|
|
} else if at.Sub(a.Start) >= ar.For {
|
|
|
|
a.State = notifier.StateFiring
|
|
|
|
}
|
|
|
|
prevT = at
|
|
|
|
result = append(result, ar.alertToTimeSeries(a, s.Timestamps[i])...)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return result, nil
|
|
|
|
}
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
// Exec executes AlertingRule expression via the given Querier.
|
|
|
|
// Based on the Querier results AlertingRule maintains notifier.Alerts
|
2021-06-09 11:20:38 +02:00
|
|
|
func (ar *AlertingRule) Exec(ctx context.Context) ([]prompbmarshal.TimeSeries, error) {
|
2021-04-28 22:41:15 +02:00
|
|
|
qMetrics, err := ar.q.Query(ctx, ar.Expr)
|
2020-06-01 12:46:37 +02:00
|
|
|
ar.mu.Lock()
|
|
|
|
defer ar.mu.Unlock()
|
|
|
|
|
|
|
|
ar.lastExecError = err
|
|
|
|
ar.lastExecTime = time.Now()
|
2021-08-05 08:59:46 +02:00
|
|
|
ar.lastExecSamples = len(qMetrics)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
2020-06-30 21:58:18 +02:00
|
|
|
return nil, fmt.Errorf("failed to execute query %q: %w", ar.Expr, err)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for h, a := range ar.alerts {
|
|
|
|
// cleanup inactive alerts from previous Exec
|
|
|
|
if a.State == notifier.StateInactive {
|
|
|
|
delete(ar.alerts, h)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
|
2020-06-01 12:46:37 +02:00
|
|
|
updated := make(map[uint64]struct{})
|
|
|
|
// update list of active alerts
|
|
|
|
for _, m := range qMetrics {
|
2021-12-10 11:10:26 +01:00
|
|
|
// set additional labels to identify group and rule name
|
|
|
|
if ar.Name != "" {
|
|
|
|
m.SetLabel(alertNameLabel, ar.Name)
|
|
|
|
}
|
|
|
|
if !*disableAlertGroupLabel && ar.GroupName != "" {
|
|
|
|
m.SetLabel(alertGroupNameLabel, ar.GroupName)
|
|
|
|
}
|
2020-12-19 13:10:59 +01:00
|
|
|
// extra labels could contain templates, so we expand them first
|
|
|
|
labels, err := expandLabels(m, qFn, ar)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to expand labels: %s", err)
|
|
|
|
}
|
|
|
|
for k, v := range labels {
|
|
|
|
// apply extra labels to datasource
|
|
|
|
// so the hash key will be consistent on restore
|
2020-11-09 23:27:32 +01:00
|
|
|
m.SetLabel(k, v)
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
h := hash(m)
|
2020-11-09 23:27:32 +01:00
|
|
|
if _, ok := updated[h]; ok {
|
|
|
|
// duplicate may be caused by extra labels
|
|
|
|
// conflicting with the metric labels
|
|
|
|
return nil, fmt.Errorf("labels %v: %w", m.Labels, errDuplicate)
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
updated[h] = struct{}{}
|
|
|
|
if a, ok := ar.alerts[h]; ok {
|
2021-06-09 11:20:38 +02:00
|
|
|
if a.Value != m.Values[0] {
|
2020-06-01 12:46:37 +02:00
|
|
|
// update Value field with latest value
|
2021-06-09 11:20:38 +02:00
|
|
|
a.Value = m.Values[0]
|
2020-06-01 12:46:37 +02:00
|
|
|
// and re-exec template since Value can be used
|
2020-12-19 13:10:59 +01:00
|
|
|
// in annotations
|
|
|
|
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
continue
|
|
|
|
}
|
2020-12-14 19:11:45 +01:00
|
|
|
a, err := ar.newAlert(m, ar.lastExecTime, qFn)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
|
|
|
ar.lastExecError = err
|
2020-06-30 21:58:18 +02:00
|
|
|
return nil, fmt.Errorf("failed to create alert: %w", err)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
a.ID = h
|
|
|
|
a.State = notifier.StatePending
|
|
|
|
ar.alerts[h] = a
|
|
|
|
}
|
|
|
|
|
|
|
|
for h, a := range ar.alerts {
|
|
|
|
// if alert wasn't updated in this iteration
|
|
|
|
// means it is resolved already
|
|
|
|
if _, ok := updated[h]; !ok {
|
|
|
|
if a.State == notifier.StatePending {
|
|
|
|
// alert was in Pending state - it is not
|
|
|
|
// active anymore
|
|
|
|
delete(ar.alerts, h)
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
a.State = notifier.StateInactive
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if a.State == notifier.StatePending && time.Since(a.Start) >= ar.For {
|
|
|
|
a.State = notifier.StateFiring
|
|
|
|
alertsFired.Inc()
|
|
|
|
}
|
|
|
|
}
|
2021-06-09 11:20:38 +02:00
|
|
|
return ar.toTimeSeries(ar.lastExecTime.Unix()), nil
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2020-12-19 13:10:59 +01:00
|
|
|
func expandLabels(m datasource.Metric, q notifier.QueryFn, ar *AlertingRule) (map[string]string, error) {
|
|
|
|
metricLabels := make(map[string]string)
|
|
|
|
for _, l := range m.Labels {
|
|
|
|
metricLabels[l.Name] = l.Value
|
|
|
|
}
|
|
|
|
tpl := notifier.AlertTplData{
|
|
|
|
Labels: metricLabels,
|
2021-06-09 11:20:38 +02:00
|
|
|
Value: m.Values[0],
|
2020-12-19 13:10:59 +01:00
|
|
|
Expr: ar.Expr,
|
|
|
|
}
|
|
|
|
return notifier.ExecTemplate(q, ar.Labels, tpl)
|
|
|
|
}
|
|
|
|
|
2021-06-09 11:20:38 +02:00
|
|
|
func (ar *AlertingRule) toTimeSeries(timestamp int64) []prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
var tss []prompbmarshal.TimeSeries
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
if a.State == notifier.StateInactive {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
ts := ar.alertToTimeSeries(a, timestamp)
|
|
|
|
tss = append(tss, ts...)
|
|
|
|
}
|
|
|
|
return tss
|
|
|
|
}
|
|
|
|
|
2020-06-01 13:34:58 +02:00
|
|
|
// UpdateWith copies all significant fields.
|
2020-06-01 12:46:37 +02:00
|
|
|
// alerts state isn't copied since
|
|
|
|
// it should be updated in next 2 Execs
|
|
|
|
func (ar *AlertingRule) UpdateWith(r Rule) error {
|
|
|
|
nr, ok := r.(*AlertingRule)
|
|
|
|
if !ok {
|
|
|
|
return fmt.Errorf("BUG: attempt to update alerting rule with wrong type %#v", r)
|
|
|
|
}
|
|
|
|
ar.Expr = nr.Expr
|
|
|
|
ar.For = nr.For
|
|
|
|
ar.Labels = nr.Labels
|
|
|
|
ar.Annotations = nr.Annotations
|
2021-06-09 11:20:38 +02:00
|
|
|
ar.EvalInterval = nr.EvalInterval
|
2021-05-22 23:26:01 +02:00
|
|
|
ar.q = nr.q
|
2020-06-01 12:46:37 +02:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: consider hashing algorithm in VM
|
|
|
|
func hash(m datasource.Metric) uint64 {
|
|
|
|
hash := fnv.New64a()
|
|
|
|
labels := m.Labels
|
|
|
|
sort.Slice(labels, func(i, j int) bool {
|
|
|
|
return labels[i].Name < labels[j].Name
|
|
|
|
})
|
|
|
|
for _, l := range labels {
|
|
|
|
// drop __name__ to be consistent with Prometheus alerting
|
|
|
|
if l.Name == "__name__" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
hash.Write([]byte(l.Name))
|
|
|
|
hash.Write([]byte(l.Value))
|
|
|
|
hash.Write([]byte("\xff"))
|
|
|
|
}
|
|
|
|
return hash.Sum64()
|
|
|
|
}
|
|
|
|
|
2020-12-14 19:11:45 +01:00
|
|
|
func (ar *AlertingRule) newAlert(m datasource.Metric, start time.Time, qFn notifier.QueryFn) (*notifier.Alert, error) {
|
2020-06-01 12:46:37 +02:00
|
|
|
a := ¬ifier.Alert{
|
|
|
|
GroupID: ar.GroupID,
|
|
|
|
Name: ar.Name,
|
|
|
|
Labels: map[string]string{},
|
2021-06-09 11:20:38 +02:00
|
|
|
Value: m.Values[0],
|
2020-06-01 12:46:37 +02:00
|
|
|
Start: start,
|
|
|
|
Expr: ar.Expr,
|
|
|
|
}
|
|
|
|
for _, l := range m.Labels {
|
|
|
|
// drop __name__ to be consistent with Prometheus alerting
|
|
|
|
if l.Name == "__name__" {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
a.Labels[l.Name] = l.Value
|
|
|
|
}
|
2020-11-09 23:27:32 +01:00
|
|
|
var err error
|
2020-12-14 19:11:45 +01:00
|
|
|
a.Annotations, err = a.ExecTemplate(qFn, ar.Annotations)
|
2020-12-19 13:10:59 +01:00
|
|
|
return a, err
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// AlertAPI generates APIAlert object from alert by its id(hash)
|
|
|
|
func (ar *AlertingRule) AlertAPI(id uint64) *APIAlert {
|
|
|
|
ar.mu.RLock()
|
|
|
|
defer ar.mu.RUnlock()
|
|
|
|
a, ok := ar.alerts[id]
|
|
|
|
if !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
return ar.newAlertAPI(*a)
|
|
|
|
}
|
|
|
|
|
|
|
|
// RuleAPI returns Rule representation in form
|
|
|
|
// of APIAlertingRule
|
|
|
|
func (ar *AlertingRule) RuleAPI() APIAlertingRule {
|
|
|
|
var lastErr string
|
|
|
|
if ar.lastExecError != nil {
|
|
|
|
lastErr = ar.lastExecError.Error()
|
|
|
|
}
|
|
|
|
return APIAlertingRule{
|
|
|
|
// encode as strings to avoid rounding
|
|
|
|
ID: fmt.Sprintf("%d", ar.ID()),
|
|
|
|
GroupID: fmt.Sprintf("%d", ar.GroupID),
|
2021-02-01 14:02:44 +01:00
|
|
|
Type: ar.Type.String(),
|
2020-06-01 12:46:37 +02:00
|
|
|
Name: ar.Name,
|
|
|
|
Expression: ar.Expr,
|
|
|
|
For: ar.For.String(),
|
|
|
|
LastError: lastErr,
|
2021-08-05 08:59:46 +02:00
|
|
|
LastSamples: ar.lastExecSamples,
|
2020-06-01 12:46:37 +02:00
|
|
|
LastExec: ar.lastExecTime,
|
|
|
|
Labels: ar.Labels,
|
|
|
|
Annotations: ar.Annotations,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// AlertsAPI generates list of APIAlert objects from existing alerts
|
|
|
|
func (ar *AlertingRule) AlertsAPI() []*APIAlert {
|
|
|
|
var alerts []*APIAlert
|
|
|
|
ar.mu.RLock()
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
alerts = append(alerts, ar.newAlertAPI(*a))
|
|
|
|
}
|
|
|
|
ar.mu.RUnlock()
|
|
|
|
return alerts
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ar *AlertingRule) newAlertAPI(a notifier.Alert) *APIAlert {
|
2021-10-13 14:25:11 +02:00
|
|
|
aa := &APIAlert{
|
2020-06-01 12:46:37 +02:00
|
|
|
// encode as strings to avoid rounding
|
|
|
|
ID: fmt.Sprintf("%d", a.ID),
|
|
|
|
GroupID: fmt.Sprintf("%d", a.GroupID),
|
2021-09-07 21:39:22 +02:00
|
|
|
RuleID: fmt.Sprintf("%d", ar.RuleID),
|
2020-06-01 12:46:37 +02:00
|
|
|
|
|
|
|
Name: a.Name,
|
|
|
|
Expression: ar.Expr,
|
|
|
|
Labels: a.Labels,
|
|
|
|
Annotations: a.Annotations,
|
|
|
|
State: a.State.String(),
|
|
|
|
ActiveAt: a.Start,
|
2021-10-22 11:30:38 +02:00
|
|
|
Restored: a.Restored,
|
2021-09-07 21:39:22 +02:00
|
|
|
Value: strconv.FormatFloat(a.Value, 'f', -1, 32),
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
2021-10-13 14:25:11 +02:00
|
|
|
if alertURLGeneratorFn != nil {
|
|
|
|
aa.SourceLink = alertURLGeneratorFn(a)
|
|
|
|
}
|
|
|
|
return aa
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Names of metrics and labels used for the time series generated from alerts.
const (
	// alertMetricName is the metric name for synthetic alert timeseries.
	// See alertToTimeSeries.
	alertMetricName = "ALERTS"
	// alertForStateMetricName is the metric name for 'for' state of alert.
	// Values of such series hold the alert start time as unix timestamp
	// in seconds, see alertForToTimeSeries. Restore queries this metric.
	alertForStateMetricName = "ALERTS_FOR_STATE"

	// alertNameLabel is the label name indicating the name of an alert.
	alertNameLabel = "alertname"
	// alertStateLabel is the label name indicating the state of an alert.
	alertStateLabel = "alertstate"

	// alertGroupNameLabel defines the label name attached for generated time series.
	// Attaching this label may be disabled via `-disableAlertgroupLabel` flag.
	alertGroupNameLabel = "alertgroup"
)
|
|
|
|
|
|
|
|
// alertToTimeSeries converts the given alert with the given timestamp to timeseries
|
2021-06-09 11:20:38 +02:00
|
|
|
func (ar *AlertingRule) alertToTimeSeries(a *notifier.Alert, timestamp int64) []prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
var tss []prompbmarshal.TimeSeries
|
2021-10-22 11:30:38 +02:00
|
|
|
tss = append(tss, alertToTimeSeries(a, timestamp))
|
2020-06-01 12:46:37 +02:00
|
|
|
if ar.For > 0 {
|
2021-10-22 11:30:38 +02:00
|
|
|
tss = append(tss, alertForToTimeSeries(a, timestamp))
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
return tss
|
|
|
|
}
|
|
|
|
|
2021-10-22 11:30:38 +02:00
|
|
|
func alertToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
labels := make(map[string]string)
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
|
|
|
labels["__name__"] = alertMetricName
|
|
|
|
labels[alertStateLabel] = a.State.String()
|
2021-06-09 11:20:38 +02:00
|
|
|
return newTimeSeries([]float64{1}, []int64{timestamp}, labels)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// alertForToTimeSeries returns a timeseries that represents
|
|
|
|
// state of active alerts, where value is time when alert become active
|
2021-10-22 11:30:38 +02:00
|
|
|
func alertForToTimeSeries(a *notifier.Alert, timestamp int64) prompbmarshal.TimeSeries {
|
2020-06-01 12:46:37 +02:00
|
|
|
labels := make(map[string]string)
|
|
|
|
for k, v := range a.Labels {
|
|
|
|
labels[k] = v
|
|
|
|
}
|
|
|
|
labels["__name__"] = alertForStateMetricName
|
2021-06-09 11:20:38 +02:00
|
|
|
return newTimeSeries([]float64{float64(a.Start.Unix())}, []int64{timestamp}, labels)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2021-10-22 11:30:38 +02:00
|
|
|
// Restore restores the state of active alerts basing on previously written time series.
|
2020-06-01 12:46:37 +02:00
|
|
|
// Restore restores only Start field. Field State will be always Pending and supposed
|
|
|
|
// to be updated on next Exec, as well as Value field.
|
|
|
|
// Only rules with For > 0 will be restored.
|
2020-07-28 13:20:31 +02:00
|
|
|
func (ar *AlertingRule) Restore(ctx context.Context, q datasource.Querier, lookback time.Duration, labels map[string]string) error {
|
2020-06-01 12:46:37 +02:00
|
|
|
if q == nil {
|
|
|
|
return fmt.Errorf("querier is nil")
|
|
|
|
}
|
2020-07-28 13:20:31 +02:00
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
qFn := func(query string) ([]datasource.Metric, error) { return ar.q.Query(ctx, query) }
|
2020-12-14 19:11:45 +01:00
|
|
|
|
2020-07-28 13:20:31 +02:00
|
|
|
// account for external labels in filter
|
|
|
|
var labelsFilter string
|
|
|
|
for k, v := range labels {
|
|
|
|
labelsFilter += fmt.Sprintf(",%s=%q", k, v)
|
|
|
|
}
|
|
|
|
|
2020-10-30 09:18:20 +01:00
|
|
|
// Get the last data point in range via MetricsQL `last_over_time`.
|
2020-06-01 12:46:37 +02:00
|
|
|
// We don't use plain PromQL since Prometheus doesn't support
|
|
|
|
// remote write protocol which is used for state persistence in vmalert.
|
2020-07-28 13:20:31 +02:00
|
|
|
expr := fmt.Sprintf("last_over_time(%s{alertname=%q%s}[%ds])",
|
|
|
|
alertForStateMetricName, ar.Name, labelsFilter, int(lookback.Seconds()))
|
2021-04-28 22:41:15 +02:00
|
|
|
qMetrics, err := q.Query(ctx, expr)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
2021-05-10 10:06:31 +02:00
|
|
|
return err
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for _, m := range qMetrics {
|
2021-06-09 11:20:38 +02:00
|
|
|
a, err := ar.newAlert(m, time.Unix(int64(m.Values[0]), 0), qFn)
|
2020-06-01 12:46:37 +02:00
|
|
|
if err != nil {
|
2020-06-30 21:58:18 +02:00
|
|
|
return fmt.Errorf("failed to create alert: %w", err)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
a.ID = hash(m)
|
|
|
|
a.State = notifier.StatePending
|
2021-10-22 11:30:38 +02:00
|
|
|
a.Restored = true
|
2020-06-01 12:46:37 +02:00
|
|
|
ar.alerts[a.ID] = a
|
2020-10-30 09:18:20 +01:00
|
|
|
logger.Infof("alert %q (%d) restored to state at %v", a.Name, a.ID, a.Start)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|