vmalert: fix blocking hot-reload process if the old rule group hasn't started yet (#7258)

Group
[sleeps](daa7183749/app/vmalert/rule/group.go (L320))
random duration before start the evaluation, and during the sleep,
`g.updateCh <- new` will be blocked since there is no `<-g.updateCh`
waiting.

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Hui Wang 2024-10-18 17:18:24 +08:00 committed by GitHub
parent 41e0bbb6d1
commit c4fe23794a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 91 additions and 8 deletions

View File

@ -324,6 +324,8 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re
g.infof("will start in %v", sleepBeforeStart)
sleepTimer := time.NewTimer(sleepBeforeStart)
randSleep:
for {
select {
case <-ctx.Done():
sleepTimer.Stop()
@ -331,7 +333,19 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re
case <-g.doneCh:
sleepTimer.Stop()
return
case ng := <-g.updateCh:
g.mu.Lock()
err := g.updateWith(ng)
if err != nil {
logger.Errorf("group %q: failed to update: %s", g.Name, err)
g.mu.Unlock()
continue
}
g.mu.Unlock()
g.infof("reload successfully")
case <-sleepTimer.C:
break randSleep
}
}
evalTS = evalTS.Add(sleepBeforeStart)
}

View File

@ -175,6 +175,74 @@ func TestUpdateWith(t *testing.T) {
})
}
func TestUpdateDuringRandSleep(t *testing.T) {
// enable rand sleep to test group update during sleep
SkipRandSleepOnGroupStart = false
defer func() {
SkipRandSleepOnGroupStart = true
}()
rule := AlertingRule{
Name: "jobDown",
Expr: "up==0",
Labels: map[string]string{
"foo": "bar",
},
}
g := &Group{
Name: "test",
Rules: []Rule{
&rule,
},
// big interval ensures big enough randSleep during start process
Interval: 100 * time.Hour,
updateCh: make(chan *Group),
}
go g.Start(context.Background(), nil, nil, nil)
rule1 := AlertingRule{
Name: "jobDown",
Expr: "up{job=\"vmagent\"}==0",
Labels: map[string]string{
"foo": "bar",
},
}
g1 := &Group{
Rules: []Rule{
&rule1,
},
}
g.updateCh <- g1
time.Sleep(10 * time.Millisecond)
g.mu.RLock()
if g.Rules[0].(*AlertingRule).Expr != "up{job=\"vmagent\"}==0" {
t.Fatalf("expected to have updated rule expr")
}
g.mu.RUnlock()
rule2 := AlertingRule{
Name: "jobDown",
Expr: "up{job=\"vmagent\"}==0",
Labels: map[string]string{
"foo": "bar",
"baz": "qux",
},
}
g2 := &Group{
Rules: []Rule{
&rule2,
},
}
g.updateCh <- g2
time.Sleep(10 * time.Millisecond)
g.mu.RLock()
if len(g.Rules[0].(*AlertingRule).Labels) != 2 {
t.Fatalf("expected to have updated labels")
}
g.mu.RUnlock()
g.Close()
}
func TestGroupStart(t *testing.T) {
const (
rules = `

View File

@ -34,6 +34,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly.
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration.
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): properly apply configuration changes during hot-reload to rule groups that haven't started yet. Previously, configuration updates to such groups could have resulted into blocking all evaluations within the group, until vmalert restart.
## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)