2020-05-10 18:58:17 +02:00
|
|
|
package main
|
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"fmt"
|
|
|
|
"sync"
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/config"
|
2020-05-10 18:58:17 +02:00
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/datasource"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/remotewrite"
|
|
|
|
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
|
|
|
|
)
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
// manager controls group states
|
2020-05-10 18:58:17 +02:00
|
|
|
type manager struct {
|
2021-04-28 22:41:15 +02:00
|
|
|
querierBuilder datasource.QuerierBuilder
|
|
|
|
notifiers []notifier.Notifier
|
2020-05-10 18:58:17 +02:00
|
|
|
|
|
|
|
rw *remotewrite.Client
|
2021-04-28 22:41:15 +02:00
|
|
|
// remote read builder.
|
|
|
|
rr datasource.QuerierBuilder
|
2020-05-10 18:58:17 +02:00
|
|
|
|
2020-07-28 13:20:31 +02:00
|
|
|
wg sync.WaitGroup
|
|
|
|
labels map[string]string
|
2020-05-10 18:58:17 +02:00
|
|
|
|
|
|
|
groupsMu sync.RWMutex
|
|
|
|
groups map[uint64]*Group
|
|
|
|
}
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
// AlertAPI generates APIAlert object from alert by its ID(hash)
|
2020-05-10 18:58:17 +02:00
|
|
|
func (m *manager) AlertAPI(gID, aID uint64) (*APIAlert, error) {
|
|
|
|
m.groupsMu.RLock()
|
|
|
|
defer m.groupsMu.RUnlock()
|
|
|
|
|
|
|
|
g, ok := m.groups[gID]
|
|
|
|
if !ok {
|
|
|
|
return nil, fmt.Errorf("can't find group with id %q", gID)
|
|
|
|
}
|
|
|
|
for _, rule := range g.Rules {
|
2020-06-01 12:46:37 +02:00
|
|
|
ar, ok := rule.(*AlertingRule)
|
|
|
|
if !ok {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if apiAlert := ar.AlertAPI(aID); apiAlert != nil {
|
2020-05-10 18:58:17 +02:00
|
|
|
return apiAlert, nil
|
|
|
|
}
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
return nil, fmt.Errorf("can't find alert with id %q in group %q", aID, g.Name)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2021-05-25 15:27:22 +02:00
|
|
|
func (m *manager) start(ctx context.Context, groupsCfg []config.Group) error {
|
|
|
|
return m.update(ctx, groupsCfg, true)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
func (m *manager) close() {
|
|
|
|
if m.rw != nil {
|
|
|
|
err := m.rw.Close()
|
|
|
|
if err != nil {
|
|
|
|
logger.Fatalf("cannot stop the remotewrite: %s", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
m.wg.Wait()
|
|
|
|
}
|
|
|
|
|
2021-05-05 09:07:19 +02:00
|
|
|
func (m *manager) startGroup(ctx context.Context, group *Group, restore bool) error {
|
2020-05-13 20:32:58 +02:00
|
|
|
if restore && m.rr != nil {
|
2020-07-28 13:20:31 +02:00
|
|
|
err := group.Restore(ctx, m.rr, *remoteReadLookBack, m.labels)
|
2020-05-10 18:58:17 +02:00
|
|
|
if err != nil {
|
2021-05-10 10:06:31 +02:00
|
|
|
if !*remoteReadIgnoreRestoreErrors {
|
|
|
|
return fmt.Errorf("failed to restore state for group %q: %w", group.Name, err)
|
|
|
|
}
|
|
|
|
logger.Errorf("error while restoring state for group %q: %s", group.Name, err)
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
m.wg.Add(1)
|
|
|
|
id := group.ID()
|
|
|
|
go func() {
|
2021-04-28 22:41:15 +02:00
|
|
|
group.start(ctx, m.notifiers, m.rw)
|
2020-05-10 18:58:17 +02:00
|
|
|
m.wg.Done()
|
|
|
|
}()
|
2020-06-01 12:46:37 +02:00
|
|
|
m.groups[id] = group
|
2021-05-05 09:07:19 +02:00
|
|
|
return nil
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
|
2021-05-25 15:27:22 +02:00
|
|
|
func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore bool) error {
|
2021-11-30 00:18:48 +01:00
|
|
|
var rrPresent, arPresent bool
|
2020-06-01 12:46:37 +02:00
|
|
|
groupsRegistry := make(map[uint64]*Group)
|
|
|
|
for _, cfg := range groupsCfg {
|
2021-11-30 00:18:48 +01:00
|
|
|
for _, r := range cfg.Rules {
|
|
|
|
if rrPresent && arPresent {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
if r.Record != "" {
|
|
|
|
rrPresent = true
|
|
|
|
}
|
|
|
|
if r.Alert != "" {
|
|
|
|
arPresent = true
|
|
|
|
}
|
|
|
|
}
|
2021-04-28 22:41:15 +02:00
|
|
|
ng := newGroup(cfg, m.querierBuilder, *evaluationInterval, m.labels)
|
2020-05-10 18:58:17 +02:00
|
|
|
groupsRegistry[ng.ID()] = ng
|
|
|
|
}
|
|
|
|
|
2021-11-30 00:18:48 +01:00
|
|
|
if rrPresent && m.rw == nil {
|
|
|
|
return fmt.Errorf("config contains recording rules but `-remoteWrite.url` isn't set")
|
|
|
|
}
|
|
|
|
if arPresent && m.notifiers == nil {
|
|
|
|
return fmt.Errorf("config contains alerting rules but `-notifier.url` isn't set")
|
|
|
|
}
|
|
|
|
|
2020-09-11 21:14:30 +02:00
|
|
|
type updateItem struct {
|
|
|
|
old *Group
|
|
|
|
new *Group
|
|
|
|
}
|
|
|
|
var toUpdate []updateItem
|
|
|
|
|
2020-05-10 18:58:17 +02:00
|
|
|
m.groupsMu.Lock()
|
|
|
|
for _, og := range m.groups {
|
2020-05-17 16:12:09 +02:00
|
|
|
ng, ok := groupsRegistry[og.ID()]
|
2020-05-10 18:58:17 +02:00
|
|
|
if !ok {
|
2020-09-11 21:14:30 +02:00
|
|
|
// old group is not present in new list,
|
|
|
|
// so must be stopped and deleted
|
2020-05-10 18:58:17 +02:00
|
|
|
og.close()
|
|
|
|
delete(m.groups, og.ID())
|
|
|
|
og = nil
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
delete(groupsRegistry, ng.ID())
|
2020-09-11 21:14:30 +02:00
|
|
|
if og.Checksum != ng.Checksum {
|
|
|
|
toUpdate = append(toUpdate, updateItem{old: og, new: ng})
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
for _, ng := range groupsRegistry {
|
2021-05-05 09:07:19 +02:00
|
|
|
if err := m.startGroup(ctx, ng, restore); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
}
|
|
|
|
m.groupsMu.Unlock()
|
2020-09-11 21:14:30 +02:00
|
|
|
|
|
|
|
if len(toUpdate) > 0 {
|
|
|
|
var wg sync.WaitGroup
|
|
|
|
for _, item := range toUpdate {
|
|
|
|
wg.Add(1)
|
|
|
|
go func(old *Group, new *Group) {
|
|
|
|
old.updateCh <- new
|
|
|
|
wg.Done()
|
|
|
|
}(item.old, item.new)
|
|
|
|
}
|
|
|
|
wg.Wait()
|
|
|
|
}
|
2020-05-10 18:58:17 +02:00
|
|
|
return nil
|
|
|
|
}
|
2020-06-01 12:46:37 +02:00
|
|
|
|
|
|
|
func (g *Group) toAPI() APIGroup {
|
2020-09-11 21:14:30 +02:00
|
|
|
g.mu.RLock()
|
|
|
|
defer g.mu.RUnlock()
|
|
|
|
|
2020-06-01 12:46:37 +02:00
|
|
|
ag := APIGroup{
|
2020-09-11 21:14:30 +02:00
|
|
|
// encode as string to avoid rounding
|
2021-05-22 23:26:01 +02:00
|
|
|
ID: fmt.Sprintf("%d", g.ID()),
|
|
|
|
|
|
|
|
Name: g.Name,
|
|
|
|
Type: g.Type.String(),
|
|
|
|
File: g.File,
|
|
|
|
Interval: g.Interval.String(),
|
|
|
|
Concurrency: g.Concurrency,
|
|
|
|
ExtraFilterLabels: g.ExtraFilterLabels,
|
2021-08-31 13:52:34 +02:00
|
|
|
Labels: g.Labels,
|
2020-06-01 12:46:37 +02:00
|
|
|
}
|
|
|
|
for _, r := range g.Rules {
|
|
|
|
switch v := r.(type) {
|
|
|
|
case *AlertingRule:
|
|
|
|
ag.AlertingRules = append(ag.AlertingRules, v.RuleAPI())
|
|
|
|
case *RecordingRule:
|
|
|
|
ag.RecordingRules = append(ag.RecordingRules, v.RuleAPI())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return ag
|
|
|
|
}
|