vmalert: update groups on config reload only if changes detected (#759)

On config reload event `vmalert` reloads configuration for every group. While
it works for simple configurations, the more complex and heavy installations may
suffer from frequent config reloads.
The change introduces the `checksum` field for every group and is set to md5 hash
of yaml configuration. The checksum will change if on any change to group
definition like rules order or annotation change. Comparing the `checksum` field
on config reload event helps to detect if group should be updated.
The groups update is now done concurrently, so reload duration will be limited by
the slowest group now.

Partially solves #691 by improving config reload speed.
This commit is contained in:
Roman Khavronenko 2020-09-11 20:14:30 +01:00 committed by Aliaksandr Valialkin
parent e53235ac5c
commit 16e0bb496e
4 changed files with 101 additions and 16 deletions

View File

@ -1,6 +1,7 @@
package config
import (
"crypto/md5"
"fmt"
"hash/fnv"
"io/ioutil"
@ -24,11 +25,30 @@ type Group struct {
Interval time.Duration `yaml:"interval,omitempty"`
Rules []Rule `yaml:"rules"`
Concurrency int `yaml:"concurrency"`
// Checksum stores the hash of yaml definition for this group.
// May be used to detect any changes like rules re-ordering etc.
Checksum string
// Catches all undefined fields and must be empty after parsing.
XXX map[string]interface{} `yaml:",inline"`
}
// UnmarshalYAML implements the yaml.Unmarshaler interface.
func (g *Group) UnmarshalYAML(unmarshal func(interface{}) error) error {
type group Group
if err := unmarshal((*group)(g)); err != nil {
return err
}
b, err := yaml.Marshal(g)
if err != nil {
return fmt.Errorf("failed to marshal group configuration for checksum: %s", err)
}
h := md5.New()
h.Write(b)
g.Checksum = fmt.Sprintf("%x", h.Sum(nil))
return nil
}
// Validate check for internal Group or Rule configuration errors
func (g *Group) Validate(validateAnnotations, validateExpressions bool) error {
if g.Name == "" {
@ -101,7 +121,7 @@ func (r *Rule) Name() string {
}
// HashRule hashes significant Rule fields into
// unique hash value
// unique hash that supposed to define Rule uniqueness
func HashRule(r Rule) uint64 {
h := fnv.New64a()
h.Write([]byte(r.Expr))
@ -112,16 +132,7 @@ func HashRule(r Rule) uint64 {
h.Write([]byte("alerting"))
h.Write([]byte(r.Alert))
}
type item struct {
key, value string
}
var kv []item
for k, v := range r.Labels {
kv = append(kv, item{key: k, value: v})
}
sort.Slice(kv, func(i, j int) bool {
return kv[i].key < kv[j].key
})
kv := sortMap(r.Labels)
for _, i := range kv {
h.Write([]byte(i.key))
h.Write([]byte(i.value))
@ -204,3 +215,18 @@ func checkOverflow(m map[string]interface{}, ctx string) error {
}
return nil
}
type item struct {
key, value string
}
func sortMap(m map[string]string) []item {
var kv []item
for k, v := range m {
kv = append(kv, item{key: k, value: v})
}
sort.Slice(kv, func(i, j int) bool {
return kv[i].key < kv[j].key
})
return kv
}

View File

@ -8,6 +8,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/app/vmalert/notifier"
"gopkg.in/yaml.v2"
)
func TestMain(m *testing.M) {
@ -320,3 +321,36 @@ func TestHashRule(t *testing.T) {
}
}
}
func TestGroupChecksum(t *testing.T) {
data := `
name: TestGroup
rules:
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
`
var g Group
if err := yaml.Unmarshal([]byte(data), &g); err != nil {
t.Fatalf("failed to unmarshal: %s", err)
}
if g.Checksum == "" {
t.Fatalf("expected to get non-empty checksum")
}
newData := `
name: TestGroup
rules:
- record: handler:requests:rate5m
expr: sum(rate(prometheus_http_requests_total[5m])) by (handler)
- alert: ExampleAlertAlwaysFiring
expr: sum by(job) (up == 1)
`
var ng Group
if err := yaml.Unmarshal([]byte(newData), &g); err != nil {
t.Fatalf("failed to unmarshal: %s", err)
}
if g.Checksum == ng.Checksum {
t.Fatalf("expected to get different checksums")
}
}

View File

@ -24,6 +24,7 @@ type Group struct {
Rules []Rule
Interval time.Duration
Concurrency int
Checksum string
doneCh chan struct{}
finishedCh chan struct{}
@ -53,6 +54,7 @@ func newGroup(cfg config.Group, defaultInterval time.Duration, labels map[string
File: cfg.File,
Interval: cfg.Interval,
Concurrency: cfg.Concurrency,
Checksum: cfg.Checksum,
doneCh: make(chan struct{}),
finishedCh: make(chan struct{}),
updateCh: make(chan *Group),
@ -156,6 +158,7 @@ func (g *Group) updateWith(newGroup *Group) error {
newRules = append(newRules, nr)
}
g.Concurrency = newGroup.Concurrency
g.Checksum = newGroup.Checksum
g.Rules = newRules
return nil
}

View File

@ -93,31 +93,53 @@ func (m *manager) update(ctx context.Context, path []string, validateTpl, valida
groupsRegistry[ng.ID()] = ng
}
type updateItem struct {
old *Group
new *Group
}
var toUpdate []updateItem
m.groupsMu.Lock()
for _, og := range m.groups {
ng, ok := groupsRegistry[og.ID()]
if !ok {
// old group is not present in new list
// and must be stopped and deleted
// old group is not present in new list,
// so must be stopped and deleted
og.close()
delete(m.groups, og.ID())
og = nil
continue
}
og.updateCh <- ng
delete(groupsRegistry, ng.ID())
if og.Checksum != ng.Checksum {
toUpdate = append(toUpdate, updateItem{old: og, new: ng})
}
}
for _, ng := range groupsRegistry {
m.startGroup(ctx, ng, restore)
}
m.groupsMu.Unlock()
if len(toUpdate) > 0 {
var wg sync.WaitGroup
for _, item := range toUpdate {
wg.Add(1)
go func(old *Group, new *Group) {
old.updateCh <- new
wg.Done()
}(item.old, item.new)
}
wg.Wait()
}
return nil
}
func (g *Group) toAPI() APIGroup {
g.mu.RLock()
defer g.mu.RUnlock()
ag := APIGroup{
// encode as strings to avoid rounding
// encode as string to avoid rounding
ID: fmt.Sprintf("%d", g.ID()),
Name: g.Name,
File: g.File,