2020-05-10 18:58:17 +02:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"sync"
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
2020-05-10 18:58:17 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
2023-10-13 13:54:33 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/rule"
|
2020-05-10 18:58:17 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
)
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
// manager controls group states
|
2020-05-10 18:58:17 +02:00
|
|
|
type manager struct {
|
2021-04-28 22:41:15 +02:00
|
|
|
querierBuilder datasource.QuerierBuilder
|
2022-02-02 13:11:41 +01:00
|
|
|
notifiers func() []notifier.Notifier
|
2020-05-10 18:58:17 +02:00
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
rw remotewrite.RWClient
|
2021-04-28 22:41:15 +02:00
|
|
|
// remote read builder.
|
|
|
|
rr datasource.QuerierBuilder
|
2020-05-10 18:58:17 +02:00
|
|
|
|
2020-07-28 13:20:31 +02:00
|
|
|
wg sync.WaitGroup
|
|
|
|
labels map[string]string
|
2020-05-10 18:58:17 +02:00
|
|
|
|
|
|
|
groupsMu sync.RWMutex
|
2023-10-13 13:54:33 +02:00
|
|
|
groups map[uint64]*rule.Group
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// ruleAPI generates apiRule object from alert by its ID(hash)
|
|
|
|
func (m *manager) ruleAPI(gID, rID uint64) (apiRule, error) {
|
2022-09-14 14:04:24 +02:00
|
|
|
m.groupsMu.RLock()
|
|
|
|
defer m.groupsMu.RUnlock()
|
|
|
|
|
|
|
|
g, ok := m.groups[gID]
|
|
|
|
if !ok {
|
2023-10-13 13:54:33 +02:00
|
|
|
return apiRule{}, fmt.Errorf("can't find group with id %d", gID)
|
2022-09-14 14:04:24 +02:00
|
|
|
}
|
|
|
|
for _, rule := range g.Rules {
|
|
|
|
if rule.ID() == rID {
|
2023-10-13 13:54:33 +02:00
|
|
|
return ruleToAPI(rule), nil
|
2022-09-14 14:04:24 +02:00
|
|
|
}
|
|
|
|
}
|
2023-10-13 13:54:33 +02:00
|
|
|
return apiRule{}, fmt.Errorf("can't find rule with id %d in group %q", rID, g.Name)
|
2022-09-14 14:04:24 +02:00
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
// alertAPI generates apiAlert object from alert by its ID(hash)
|
|
|
|
func (m *manager) alertAPI(gID, aID uint64) (*apiAlert, error) {
|
2020-05-10 18:58:17 +02:00
|
|
|
m.groupsMu.RLock()
|
|
|
|
defer m.groupsMu.RUnlock()
|
|
|
|
|
|
|
|
g, ok := m.groups[gID]
|
|
|
|
if !ok {
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
return nil, fmt.Errorf("can't find group with id %d", gID)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
2023-10-13 13:54:33 +02:00
|
|
|
for _, r := range g.Rules {
|
|
|
|
ar, ok := r.(*rule.AlertingRule)
|
2020-06-01 12:46:37 +02:00
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
2023-10-13 13:54:33 +02:00
|
|
|
if apiAlert := alertToAPI(ar, aID); apiAlert != nil {
|
2020-05-10 18:58:17 +02:00
|
|
|
return apiAlert, nil
|
|
|
|
}
|
|
|
|
}
|
vmalert: fix labels and annotations processing for alerts (#2403)
To improve compatibility with Prometheus alerting the order of
templates processing has changed.
Before, vmalert did all labels processing beforehand. It meant
all extra labels (such as `alertname`, `alertgroup` or rule labels)
were available in templating. All collisions were resolved in favour
of extra labels.
In Prometheus, only labels from the received metric are available in
templating, so no collisions are possible.
This change makes vmalert's behaviour similar to Prometheus.
For example, consider alerting rule which is triggered by time series
with `alertname` label. In vmalert, this label would be overriden
by alerting rule's name everywhere: for alert labels, for annotations, etc.
In Prometheus, it would be overriden for alert's labels only, but in annotations
the original label value would be available.
See more details here https://github.com/prometheus/compliance/issues/80
Signed-off-by: hagen1778 <roman@victoriametrics.com>
2022-04-06 20:24:45 +02:00
|
|
|
return nil, fmt.Errorf("can't find alert with id %d in group %q", aID, g.Name)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2021-05-25 15:27:22 +02:00
|
|
|
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
|
|
|
|
return m.update(ctx, groupsCfg, true)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (m *manager) close() {
|
|
|
|
if m.rw != nil {
|
|
|
|
err := m.rw.Close()
|
|
|
|
if err != nil {
|
|
|
|
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
m.wg.Wait()
|
|
|
|
}
|
|
|
|
|
2023-10-13 13:54:33 +02:00
|
|
|
func (m *manager) startGroup(ctx context.Context, g *rule.Group, restore bool) error {
|
2023-02-04 04:46:13 +01:00
|
|
|
m.wg.Add(1)
|
|
|
|
id := g.ID()
|
|
|
|
go func() {
|
2023-03-01 15:48:20 +01:00
|
|
|
defer m.wg.Done()
|
2023-02-04 04:46:13 +01:00
|
|
|
if restore {
|
2023-10-13 13:54:33 +02:00
|
|
|
g.Start(ctx, m.notifiers, m.rw, m.rr)
|
2023-02-04 04:46:13 +01:00
|
|
|
} else {
|
2023-10-13 13:54:33 +02:00
|
|
|
g.Start(ctx, m.notifiers, m.rw, nil)
|
2023-02-04 04:46:13 +01:00
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}()
|
2023-02-04 04:46:13 +01:00
|
|
|
m.groups[id] = g
|
2021-05-05 09:07:19 +02:00
|
|
|
return nil
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2021-05-25 15:27:22 +02:00
|
|
|
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
|
2021-11-30 00:18:48 +01:00
|
|
|
var rrPresent, arPresent bool
|
2023-10-13 13:54:33 +02:00
|
|
|
groupsRegistry := make(map[uint64]*rule.Group)
|
2020-06-01 12:46:37 +02:00
|
|
|
for _, cfg := range groupsCfg {
|
2021-11-30 00:18:48 +01:00
|
|
|
for _, r := range cfg.Rules {
|
|
|
|
if rrPresent && arPresent {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if r.Record != "" {
|
|
|
|
rrPresent = true
|
|
|
|
}
|
|
|
|
if r.Alert != "" {
|
|
|
|
arPresent = true
|
|
|
|
}
|
|
|
|
}
|
2023-10-13 13:54:33 +02:00
|
|
|
ng := rule.NewGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
2020-05-10 18:58:17 +02:00
|
|
|
groupsRegistry[ng.ID()] = ng
|
|
|
|
}
|
|
|
|
|
2021-11-30 00:18:48 +01:00
|
|
|
if rrPresent && m.rw == nil {
|
|
|
|
return fmt.Errorf("config contains recording rules but `-remoteWrite.url` isn't set")
|
|
|
|
}
|
|
|
|
if arPresent && m.notifiers == nil {
|
2023-07-18 15:06:19 +02:00
|
|
|
return fmt.Errorf("config contains alerting rules but neither `-notifier.url` nor `-notifier.config` nor `-notifier.blackhole` aren't set")
|
2021-11-30 00:18:48 +01:00
|
|
|
}
|
|
|
|
|
2020-09-11 21:14:30 +02:00
|
|
|
type updateItem struct {
|
2023-10-13 13:54:33 +02:00
|
|
|
old *rule.Group
|
|
|
|
new *rule.Group
|
2020-09-11 21:14:30 +02:00
|
|
|
}
|
|
|
|
var toUpdate []updateItem
|
|
|
|
|
2020-05-10 18:58:17 +02:00
|
|
|
m.groupsMu.Lock()
|
|
|
|
for _, og := range m.groups {
|
2020-05-17 16:12:09 +02:00
|
|
|
ng, ok := groupsRegistry[og.ID()]
|
2020-05-10 18:58:17 +02:00
|
|
|
if !ok {
|
2020-09-11 21:14:30 +02:00
|
|
|
// old group is not present in new list,
|
|
|
|
// so must be stopped and deleted
|
2023-10-13 13:54:33 +02:00
|
|
|
og.Close()
|
2020-05-10 18:58:17 +02:00
|
|
|
delete(m.groups, og.ID())
|
|
|
|
og = nil
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
delete(groupsRegistry, ng.ID())
|
2020-09-11 21:14:30 +02:00
|
|
|
if og.Checksum != ng.Checksum {
|
|
|
|
toUpdate = append(toUpdate, updateItem{old: og, new: ng})
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
for _, ng := range groupsRegistry {
|
2021-05-05 09:07:19 +02:00
|
|
|
if err := m.startGroup(ctx, ng, restore); err != nil {
|
2023-03-01 15:48:20 +01:00
|
|
|
m.groupsMu.Unlock()
|
2021-05-05 09:07:19 +02:00
|
|
|
return err
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
m.groupsMu.Unlock()
|
2020-09-11 21:14:30 +02:00
|
|
|
|
|
|
|
if len(toUpdate) > 0 {
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
for _, item := range toUpdate {
|
|
|
|
wg.Add(1)
|
2024-01-26 22:42:21 +01:00
|
|
|
// cancel evaluation so the Update will be applied as fast as possible.
|
|
|
|
// it is important to call InterruptEval before the update, because cancel fn
|
|
|
|
// can be re-assigned during the update.
|
|
|
|
item.old.InterruptEval()
|
2023-10-13 13:54:33 +02:00
|
|
|
go func(old *rule.Group, new *rule.Group) {
|
|
|
|
old.UpdateWith(new)
|
2020-09-11 21:14:30 +02:00
|
|
|
wg.Done()
|
|
|
|
}(item.old, item.new)
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
return nil
|
|
|
|
}
|