2020-05-10 18:58:17 +02:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"hash/fnv"
|
2020-06-01 12:46:37 +02:00
|
|
|
"sync"
|
2020-05-10 18:58:17 +02:00
|
|
|
"time"
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
2020-05-10 18:58:17 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
2020-06-29 21:21:03 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/utils"
|
2020-05-10 18:58:17 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
"github.com/VictoriaMetrics/metrics"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Group is an entity for grouping rules
|
|
|
|
type Group struct {
|
2021-08-31 13:52:34 +02:00
|
|
|
mu sync.RWMutex
|
|
|
|
Name string
|
|
|
|
File string
|
|
|
|
Rules []Rule
|
|
|
|
Type datasource.Type
|
|
|
|
Interval time.Duration
|
|
|
|
Concurrency int
|
|
|
|
Checksum string
|
|
|
|
|
2021-05-22 23:26:01 +02:00
|
|
|
ExtraFilterLabels map[string]string
|
2021-08-31 13:52:34 +02:00
|
|
|
Labels map[string]string
|
2020-05-10 18:58:17 +02:00
|
|
|
|
2020-05-17 16:12:09 +02:00
|
|
|
doneCh chan struct{}
|
|
|
|
finishedCh chan struct{}
|
|
|
|
// channel accepts new Group obj
|
|
|
|
// which supposed to update current group
|
2020-06-01 12:46:37 +02:00
|
|
|
updateCh chan *Group
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
|
|
|
metrics *groupMetrics
|
|
|
|
}
|
|
|
|
|
|
|
|
type groupMetrics struct {
|
|
|
|
iterationTotal *counter
|
|
|
|
iterationDuration *summary
|
|
|
|
}
|
|
|
|
|
|
|
|
func newGroupMetrics(name, file string) *groupMetrics {
|
|
|
|
m := &groupMetrics{}
|
|
|
|
labels := fmt.Sprintf(`group=%q, file=%q`, name, file)
|
|
|
|
m.iterationTotal = getOrCreateCounter(fmt.Sprintf(`vmalert_iteration_total{%s}`, labels))
|
|
|
|
m.iterationDuration = getOrCreateSummary(fmt.Sprintf(`vmalert_iteration_duration_seconds{%s}`, labels))
|
|
|
|
return m
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
|
2021-08-31 13:52:34 +02:00
|
|
|
// merges group rule labels into result map
|
|
|
|
// set2 has priority over set1.
|
|
|
|
func mergeLabels(groupName, ruleName string, set1, set2 map[string]string) map[string]string {
|
|
|
|
r := map[string]string{}
|
|
|
|
for k, v := range set1 {
|
|
|
|
r[k] = v
|
|
|
|
}
|
|
|
|
for k, v := range set2 {
|
|
|
|
if prevV, ok := r[k]; ok {
|
|
|
|
logger.Infof("label %q=%q for rule %q.%q overwritten with external label %q=%q",
|
|
|
|
k, prevV, groupName, ruleName, k, v)
|
|
|
|
}
|
|
|
|
r[k] = v
|
|
|
|
}
|
|
|
|
return r
|
|
|
|
}
|
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
func newGroup(cfg config.Group, qb datasource.QuerierBuilder, defaultInterval time.Duration, labels map[string]string) *Group {
|
2020-06-01 12:46:37 +02:00
|
|
|
g := &Group{
|
2021-05-22 23:26:01 +02:00
|
|
|
Type: cfg.Type,
|
|
|
|
Name: cfg.Name,
|
|
|
|
File: cfg.File,
|
2021-07-12 11:34:10 +02:00
|
|
|
Interval: cfg.Interval.Duration(),
|
2021-05-22 23:26:01 +02:00
|
|
|
Concurrency: cfg.Concurrency,
|
|
|
|
Checksum: cfg.Checksum,
|
|
|
|
ExtraFilterLabels: cfg.ExtraFilterLabels,
|
2021-08-31 13:52:34 +02:00
|
|
|
Labels: cfg.Labels,
|
2021-05-22 23:26:01 +02:00
|
|
|
|
|
|
|
doneCh: make(chan struct{}),
|
|
|
|
finishedCh: make(chan struct{}),
|
|
|
|
updateCh: make(chan *Group),
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
g.metrics = newGroupMetrics(g.Name, g.File)
|
2020-06-01 12:46:37 +02:00
|
|
|
if g.Interval == 0 {
|
|
|
|
g.Interval = defaultInterval
|
|
|
|
}
|
2020-06-09 14:21:20 +02:00
|
|
|
if g.Concurrency < 1 {
|
|
|
|
g.Concurrency = 1
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
rules := make([]Rule, len(cfg.Rules))
|
|
|
|
for i, r := range cfg.Rules {
|
2021-08-31 13:52:34 +02:00
|
|
|
var extraLabels map[string]string
|
|
|
|
// apply external labels
|
|
|
|
if len(labels) > 0 {
|
|
|
|
extraLabels = labels
|
|
|
|
}
|
|
|
|
// apply group labels, it has priority on external labels
|
|
|
|
if len(cfg.Labels) > 0 {
|
|
|
|
extraLabels = mergeLabels(g.Name, r.Name(), extraLabels, g.Labels)
|
2020-07-28 13:20:31 +02:00
|
|
|
}
|
2021-08-31 13:52:34 +02:00
|
|
|
// apply rules labels, it has priority on other labels
|
|
|
|
if len(extraLabels) > 0 {
|
|
|
|
r.Labels = mergeLabels(g.Name, r.Name(), extraLabels, r.Labels)
|
|
|
|
}
|
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
rules[i] = g.newRule(qb, r)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
g.Rules = rules
|
|
|
|
return g
|
|
|
|
}
|
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
func (g *Group) newRule(qb datasource.QuerierBuilder, rule config.Rule) Rule {
|
2020-06-01 12:46:37 +02:00
|
|
|
if rule.Alert != "" {
|
2021-04-28 22:41:15 +02:00
|
|
|
return newAlertingRule(qb, g, rule)
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
2021-04-28 22:41:15 +02:00
|
|
|
return newRecordingRule(qb, g, rule)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// ID return unique group ID that consists of
|
|
|
|
// rules file and group name
|
2020-05-17 16:12:09 +02:00
|
|
|
func (g *Group) ID() uint64 {
|
2021-10-19 15:44:13 +02:00
|
|
|
g.mu.RLock()
|
|
|
|
defer g.mu.RUnlock()
|
|
|
|
|
2020-05-10 18:58:17 +02:00
|
|
|
hash := fnv.New64a()
|
|
|
|
hash.Write([]byte(g.File))
|
|
|
|
hash.Write([]byte("\xff"))
|
|
|
|
hash.Write([]byte(g.Name))
|
2021-02-01 14:02:44 +01:00
|
|
|
hash.Write([]byte(g.Type.Get()))
|
2020-05-10 18:58:17 +02:00
|
|
|
return hash.Sum64()
|
|
|
|
}
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
// Restore restores alerts state for group rules
|
2021-04-28 22:41:15 +02:00
|
|
|
func (g *Group) Restore(ctx context.Context, qb datasource.QuerierBuilder, lookback time.Duration, labels map[string]string) error {
|
2021-08-31 13:52:34 +02:00
|
|
|
labels = mergeLabels(g.Name, "", labels, g.Labels)
|
2020-05-10 18:58:17 +02:00
|
|
|
for _, rule := range g.Rules {
|
2020-06-01 12:46:37 +02:00
|
|
|
rr, ok := rule.(*AlertingRule)
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if rr.For < 1 {
|
|
|
|
continue
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
2021-05-22 23:26:01 +02:00
|
|
|
// ignore g.ExtraFilterLabels on purpose, so it
|
|
|
|
// won't affect the restore procedure.
|
2021-04-28 22:41:15 +02:00
|
|
|
q := qb.BuildWithParams(datasource.QuerierParams{})
|
2020-07-28 13:20:31 +02:00
|
|
|
if err := rr.Restore(ctx, q, lookback, labels); err != nil {
|
2020-06-30 21:58:18 +02:00
|
|
|
return fmt.Errorf("error while restoring rule %q: %w", rule, err)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// updateWith updates existing group with
|
2020-06-01 12:46:37 +02:00
|
|
|
// passed group object. This function ignores group
|
|
|
|
// evaluation interval change. It supposed to be updated
|
|
|
|
// in group.start function.
|
2020-05-17 16:12:09 +02:00
|
|
|
// Not thread-safe.
|
2020-06-01 12:46:37 +02:00
|
|
|
func (g *Group) updateWith(newGroup *Group) error {
|
|
|
|
rulesRegistry := make(map[uint64]Rule)
|
2020-05-10 18:58:17 +02:00
|
|
|
for _, nr := range newGroup.Rules {
|
2020-06-01 12:46:37 +02:00
|
|
|
rulesRegistry[nr.ID()] = nr
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
for i, or := range g.Rules {
|
2020-06-01 12:46:37 +02:00
|
|
|
nr, ok := rulesRegistry[or.ID()]
|
2020-05-10 18:58:17 +02:00
|
|
|
if !ok {
|
|
|
|
// old rule is not present in the new list
|
2020-05-15 08:55:22 +02:00
|
|
|
// so we mark it for removing
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
g.Rules[i].Close()
|
2020-05-15 08:55:22 +02:00
|
|
|
g.Rules[i] = nil
|
2020-05-10 18:58:17 +02:00
|
|
|
continue
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
if err := or.UpdateWith(nr); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
delete(rulesRegistry, nr.ID())
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
var newRules []Rule
|
2020-05-15 08:55:22 +02:00
|
|
|
for _, r := range g.Rules {
|
|
|
|
if r == nil {
|
|
|
|
// skip nil rules
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
newRules = append(newRules, r)
|
|
|
|
}
|
|
|
|
// add the rest of rules from registry
|
2020-05-10 18:58:17 +02:00
|
|
|
for _, nr := range rulesRegistry {
|
2020-05-15 08:55:22 +02:00
|
|
|
newRules = append(newRules, nr)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
2021-09-23 16:55:59 +02:00
|
|
|
// note that g.Interval is not updated here
|
|
|
|
// so the value can be compared later in
|
|
|
|
// group.Start function
|
2021-02-01 14:02:44 +01:00
|
|
|
g.Type = newGroup.Type
|
2020-06-09 14:21:20 +02:00
|
|
|
g.Concurrency = newGroup.Concurrency
|
2021-05-22 23:26:01 +02:00
|
|
|
g.ExtraFilterLabels = newGroup.ExtraFilterLabels
|
2021-08-31 13:52:34 +02:00
|
|
|
g.Labels = newGroup.Labels
|
2020-09-11 21:14:30 +02:00
|
|
|
g.Checksum = newGroup.Checksum
|
2020-05-15 08:55:22 +02:00
|
|
|
g.Rules = newRules
|
2020-06-01 12:46:37 +02:00
|
|
|
return nil
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (g *Group) close() {
|
2020-05-17 16:12:09 +02:00
|
|
|
if g.doneCh == nil {
|
2020-05-10 18:58:17 +02:00
|
|
|
return
|
|
|
|
}
|
2020-05-17 16:12:09 +02:00
|
|
|
close(g.doneCh)
|
|
|
|
<-g.finishedCh
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
|
|
|
metrics.UnregisterMetric(g.metrics.iterationDuration.name)
|
|
|
|
metrics.UnregisterMetric(g.metrics.iterationTotal.name)
|
|
|
|
for _, rule := range g.Rules {
|
|
|
|
rule.Close()
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2020-09-03 00:00:55 +02:00
|
|
|
// skipRandSleepOnGroupStart, when true, makes Group.start skip the
// randomized delay it otherwise waits for before the first evaluation.
var skipRandSleepOnGroupStart bool
|
|
|
|
|
2021-04-28 22:41:15 +02:00
|
|
|
func (g *Group) start(ctx context.Context, nts []notifier.Notifier, rw *remotewrite.Client) {
|
2020-06-09 14:21:20 +02:00
|
|
|
defer func() { close(g.finishedCh) }()
|
2020-09-03 00:00:55 +02:00
|
|
|
|
|
|
|
// Spread group rules evaluation over time in order to reduce load on VictoriaMetrics.
|
|
|
|
if !skipRandSleepOnGroupStart {
|
2021-10-27 18:59:13 +02:00
|
|
|
randSleep := uint64(float64(g.Interval) * (float64(g.ID()) / (1 << 64)))
|
2020-09-03 00:00:55 +02:00
|
|
|
sleepOffset := uint64(time.Now().UnixNano()) % uint64(g.Interval)
|
|
|
|
if randSleep < sleepOffset {
|
|
|
|
randSleep += uint64(g.Interval)
|
|
|
|
}
|
|
|
|
randSleep -= sleepOffset
|
|
|
|
sleepTimer := time.NewTimer(time.Duration(randSleep))
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
sleepTimer.Stop()
|
|
|
|
return
|
|
|
|
case <-g.doneCh:
|
|
|
|
sleepTimer.Stop()
|
|
|
|
return
|
|
|
|
case <-sleepTimer.C:
|
|
|
|
}
|
2020-09-02 23:58:54 +02:00
|
|
|
}
|
2020-09-03 00:00:55 +02:00
|
|
|
|
|
|
|
logger.Infof("group %q started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
2021-08-31 11:28:02 +02:00
|
|
|
e := &executor{rw: rw}
|
|
|
|
for _, nt := range nts {
|
|
|
|
ent := eNotifier{
|
|
|
|
Notifier: nt,
|
|
|
|
alertsSent: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_sent_total{addr=%q}", nt.Addr())),
|
|
|
|
alertsSendErrors: getOrCreateCounter(fmt.Sprintf("vmalert_alerts_send_errors_total{addr=%q}", nt.Addr())),
|
|
|
|
}
|
|
|
|
e.notifiers = append(e.notifiers, ent)
|
|
|
|
}
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
t := time.NewTicker(g.Interval)
|
2020-05-10 18:58:17 +02:00
|
|
|
defer t.Stop()
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-ctx.Done():
|
|
|
|
logger.Infof("group %q: context cancelled", g.Name)
|
|
|
|
return
|
2020-05-17 16:12:09 +02:00
|
|
|
case <-g.doneCh:
|
2020-05-10 18:58:17 +02:00
|
|
|
logger.Infof("group %q: received stop signal", g.Name)
|
|
|
|
return
|
2020-05-17 16:12:09 +02:00
|
|
|
case ng := <-g.updateCh:
|
2020-06-01 12:46:37 +02:00
|
|
|
g.mu.Lock()
|
|
|
|
err := g.updateWith(ng)
|
|
|
|
if err != nil {
|
|
|
|
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
|
|
|
g.mu.Unlock()
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if g.Interval != ng.Interval {
|
|
|
|
g.Interval = ng.Interval
|
|
|
|
t.Stop()
|
|
|
|
t = time.NewTicker(g.Interval)
|
|
|
|
}
|
|
|
|
g.mu.Unlock()
|
2020-06-09 14:21:20 +02:00
|
|
|
logger.Infof("group %q re-started; interval=%v; concurrency=%d", g.Name, g.Interval, g.Concurrency)
|
2020-05-10 18:58:17 +02:00
|
|
|
case <-t.C:
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
g.metrics.iterationTotal.Inc()
|
2020-05-10 18:58:17 +02:00
|
|
|
iterationStart := time.Now()
|
2021-10-25 11:15:02 +02:00
|
|
|
if len(g.Rules) > 0 {
|
|
|
|
resolveDuration := getResolveDuration(g.Interval)
|
|
|
|
errs := e.execConcurrently(ctx, g.Rules, g.Concurrency, resolveDuration)
|
|
|
|
for err := range errs {
|
|
|
|
if err != nil {
|
|
|
|
logger.Errorf("group %q: %s", g.Name, err)
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
2020-06-09 14:21:20 +02:00
|
|
|
}
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
g.metrics.iterationDuration.UpdateDuration(iterationStart)
|
2020-06-09 14:21:20 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
|
2021-09-13 14:48:18 +02:00
|
|
|
// resolveDuration for alerts is equal to 3 interval evaluations
|
|
|
|
// so in case if vmalert stops sending updates for some reason,
|
|
|
|
// notifier could automatically resolve the alert.
|
|
|
|
func getResolveDuration(groupInterval time.Duration) time.Duration {
|
|
|
|
resolveInterval := groupInterval * 3
|
|
|
|
if *maxResolveDuration > 0 && (resolveInterval > *maxResolveDuration) {
|
|
|
|
return *maxResolveDuration
|
|
|
|
}
|
|
|
|
return resolveInterval
|
|
|
|
}
|
|
|
|
|
2020-06-09 14:21:20 +02:00
|
|
|
type executor struct {
|
2021-08-31 11:28:02 +02:00
|
|
|
notifiers []eNotifier
|
2020-06-29 21:21:03 +02:00
|
|
|
rw *remotewrite.Client
|
2020-06-09 14:21:20 +02:00
|
|
|
}
|
|
|
|
|
2021-08-31 11:28:02 +02:00
|
|
|
type eNotifier struct {
|
|
|
|
notifier.Notifier
|
|
|
|
alertsSent *counter
|
|
|
|
alertsSendErrors *counter
|
|
|
|
}
|
|
|
|
|
2021-09-13 14:48:18 +02:00
|
|
|
func (e *executor) execConcurrently(ctx context.Context, rules []Rule, concurrency int, resolveDuration time.Duration) chan error {
|
2020-06-09 14:21:20 +02:00
|
|
|
res := make(chan error, len(rules))
|
|
|
|
if concurrency == 1 {
|
|
|
|
// fast path
|
|
|
|
for _, rule := range rules {
|
2021-09-13 14:48:18 +02:00
|
|
|
res <- e.exec(ctx, rule, resolveDuration)
|
2020-06-09 14:21:20 +02:00
|
|
|
}
|
|
|
|
close(res)
|
|
|
|
return res
|
|
|
|
}
|
|
|
|
|
|
|
|
sem := make(chan struct{}, concurrency)
|
|
|
|
go func() {
|
|
|
|
wg := sync.WaitGroup{}
|
|
|
|
for _, rule := range rules {
|
|
|
|
sem <- struct{}{}
|
|
|
|
wg.Add(1)
|
|
|
|
go func(r Rule) {
|
2021-09-13 14:48:18 +02:00
|
|
|
res <- e.exec(ctx, r, resolveDuration)
|
2020-06-09 14:21:20 +02:00
|
|
|
<-sem
|
|
|
|
wg.Done()
|
|
|
|
}(rule)
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
close(res)
|
|
|
|
}()
|
|
|
|
return res
|
|
|
|
}
|
|
|
|
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
var (
|
2021-08-31 11:28:02 +02:00
|
|
|
alertsFired = metrics.NewCounter(`vmalert_alerts_fired_total`)
|
|
|
|
|
|
|
|
execTotal = metrics.NewCounter(`vmalert_execution_total`)
|
|
|
|
execErrors = metrics.NewCounter(`vmalert_execution_errors_total`)
|
app/vmalert: extend metrics set exported by `vmalert` #573 (#654)
* app/vmalert: extend metrics set exported by `vmalert` #573
New metrics were added to improve observability:
+ vmalert_alerts_pending{alertname, group} - number of pending alerts per group
per alert;
+ vmalert_alerts_acitve{alertname, group} - number of active alerts per group
per alert;
+ vmalert_alerts_error{alertname, group} - is 1 if alertname ended up with error
during prev execution, is 0 if no errors happened;
+ vmalert_recording_rules_error{recording, group} - is 1 if recording rule
ended up with error during prev execution, is 0 if no errors happened;
* vmalert_iteration_total{group, file} - now contains group and file name labels.
This should improve control over specific groups;
* vmalert_iteration_duration_seconds{group, file} - now contains group and file name labels. This should improve control over specific groups;
Some collisions for alerts and recording rules are possible, because neither
group name nor alert/recording rule name are unique for compatibility reasons.
Commit contains list of TODOs for Unregistering metrics since groups and rules
are ephemeral and could be removed without application restart. In order to
unlock Unregistering feature corresponding PR was filed - https://github.com/VictoriaMetrics/metrics/pull/13
* app/vmalert: extend metrics set exported by `vmalert` #573
The changes are following:
* add an ID label to rules metrics, since `name` collisions within one group is
a common case - see the k8s example alerts;
* supports metrics unregistering on rule updates. Consider the case when one rule
was added or removed from the group, or the whole group was added or removed.
The change depends on https://github.com/VictoriaMetrics/metrics/pull/16
where race condition for Unregister method was fixed.
2020-08-09 08:41:29 +02:00
|
|
|
|
|
|
|
remoteWriteErrors = metrics.NewCounter(`vmalert_remotewrite_errors_total`)
|
|
|
|
)
|
|
|
|
|
2021-09-13 14:48:18 +02:00
|
|
|
func (e *executor) exec(ctx context.Context, rule Rule, resolveDuration time.Duration) error {
|
2020-06-09 14:21:20 +02:00
|
|
|
execTotal.Inc()
|
|
|
|
|
2021-06-09 11:20:38 +02:00
|
|
|
tss, err := rule.Exec(ctx)
|
2020-06-09 14:21:20 +02:00
|
|
|
if err != nil {
|
|
|
|
execErrors.Inc()
|
2020-06-30 21:58:18 +02:00
|
|
|
return fmt.Errorf("rule %q: failed to execute: %w", rule, err)
|
2020-06-09 14:21:20 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
if len(tss) > 0 && e.rw != nil {
|
|
|
|
for _, ts := range tss {
|
|
|
|
if err := e.rw.Push(ts); err != nil {
|
|
|
|
remoteWriteErrors.Inc()
|
2020-06-30 21:58:18 +02:00
|
|
|
return fmt.Errorf("rule %q: remote write failure: %w", rule, err)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2020-06-09 14:21:20 +02:00
|
|
|
|
|
|
|
ar, ok := rule.(*AlertingRule)
|
|
|
|
if !ok {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
var alerts []notifier.Alert
|
|
|
|
for _, a := range ar.alerts {
|
|
|
|
switch a.State {
|
|
|
|
case notifier.StateFiring:
|
2021-09-13 14:48:18 +02:00
|
|
|
a.End = time.Now().Add(resolveDuration)
|
2020-06-09 14:21:20 +02:00
|
|
|
alerts = append(alerts, *a)
|
|
|
|
case notifier.StateInactive:
|
|
|
|
// set End to execStart to notify
|
|
|
|
// that it was just resolved
|
|
|
|
a.End = time.Now()
|
|
|
|
alerts = append(alerts, *a)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if len(alerts) < 1 {
|
|
|
|
return nil
|
|
|
|
}
|
2020-06-29 21:21:03 +02:00
|
|
|
|
|
|
|
errGr := new(utils.ErrGroup)
|
|
|
|
for _, nt := range e.notifiers {
|
2021-08-31 11:28:02 +02:00
|
|
|
nt.alertsSent.Add(len(alerts))
|
2020-06-29 21:21:03 +02:00
|
|
|
if err := nt.Send(ctx, alerts); err != nil {
|
2021-08-31 11:28:02 +02:00
|
|
|
nt.alertsSendErrors.Inc()
|
2020-06-30 21:58:18 +02:00
|
|
|
errGr.Add(fmt.Errorf("rule %q: failed to send alerts: %w", rule, err))
|
2020-06-29 21:21:03 +02:00
|
|
|
}
|
2020-06-09 14:21:20 +02:00
|
|
|
}
|
2020-06-29 21:21:03 +02:00
|
|
|
return errGr.Err()
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|