vmalert: fix blocking hot-reload process if the old rule group hasn't started yet (#7258)

Group
[sleeps](daa7183749/app/vmalert/rule/group.go (L320))
random duration before start the evaluation, and during the sleep,
`g.updateCh <- new` will be blocked since there is no `<-g.updateCh`
waiting.

---------

Signed-off-by: hagen1778 <roman@victoriametrics.com>
Co-authored-by: hagen1778 <roman@victoriametrics.com>

(cherry picked from commit c4fe23794a)
Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
Hui Wang 2024-10-18 11:18:24 +02:00 committed by hagen1778
parent 2314668937
commit abd2f34833
No known key found for this signature in database
GPG Key ID: E92986095E0DD614
3 changed files with 92 additions and 9 deletions

View File

@ -324,14 +324,28 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re
g.infof("will start in %v", sleepBeforeStart)
sleepTimer := time.NewTimer(sleepBeforeStart)
select {
case <-ctx.Done():
sleepTimer.Stop()
return
case <-g.doneCh:
sleepTimer.Stop()
return
case <-sleepTimer.C:
randSleep:
for {
select {
case <-ctx.Done():
sleepTimer.Stop()
return
case <-g.doneCh:
sleepTimer.Stop()
return
case ng := <-g.updateCh:
g.mu.Lock()
err := g.updateWith(ng)
if err != nil {
logger.Errorf("group %q: failed to update: %s", g.Name, err)
g.mu.Unlock()
continue
}
g.mu.Unlock()
g.infof("reload successfully")
case <-sleepTimer.C:
break randSleep
}
}
evalTS = evalTS.Add(sleepBeforeStart)
}

View File

@ -175,6 +175,74 @@ func TestUpdateWith(t *testing.T) {
})
}
func TestUpdateDuringRandSleep(t *testing.T) {
// enable rand sleep to test group update during sleep
SkipRandSleepOnGroupStart = false
defer func() {
SkipRandSleepOnGroupStart = true
}()
rule := AlertingRule{
Name: "jobDown",
Expr: "up==0",
Labels: map[string]string{
"foo": "bar",
},
}
g := &Group{
Name: "test",
Rules: []Rule{
&rule,
},
// big interval ensures big enough randSleep during start process
Interval: 100 * time.Hour,
updateCh: make(chan *Group),
}
go g.Start(context.Background(), nil, nil, nil)
rule1 := AlertingRule{
Name: "jobDown",
Expr: "up{job=\"vmagent\"}==0",
Labels: map[string]string{
"foo": "bar",
},
}
g1 := &Group{
Rules: []Rule{
&rule1,
},
}
g.updateCh <- g1
time.Sleep(10 * time.Millisecond)
g.mu.RLock()
if g.Rules[0].(*AlertingRule).Expr != "up{job=\"vmagent\"}==0" {
t.Fatalf("expected to have updated rule expr")
}
g.mu.RUnlock()
rule2 := AlertingRule{
Name: "jobDown",
Expr: "up{job=\"vmagent\"}==0",
Labels: map[string]string{
"foo": "bar",
"baz": "qux",
},
}
g2 := &Group{
Rules: []Rule{
&rule2,
},
}
g.updateCh <- g2
time.Sleep(10 * time.Millisecond)
g.mu.RLock()
if len(g.Rules[0].(*AlertingRule).Labels) != 2 {
t.Fatalf("expected to have updated labels")
}
g.mu.RUnlock()
g.Close()
}
func TestGroupStart(t *testing.T) {
const (
rules = `

View File

@ -28,12 +28,13 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* BUGFIX: [vmgateway](https://docs.victoriametrics.com/vmgateway/): fix possible panic during parsing of a token without `vm_access` claim. This issue was introduced in v1.104.0.
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix error messages rendering from overflowing the screen with long messages. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7207).
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): properly add metrics tags for `opentsdb` migration source. Previously it could have empty values. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7161).
* BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): reduce the initial health check interval for datasource. This reduces the time spent on evaluating rules by vmalert-tool. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6970).
* BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): allow specifying empty labels list `labels: '{}'` in the same way as promtool does. This improves compatibility between these tools.
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly.
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration.
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): properly apply configuration changes during hot-reload to rule groups that haven't started yet. Previously, configuration updates to such groups could have resulted into blocking all evaluations within the group, until vmalert restart.
## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)