mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-22 00:00:39 +01:00
vmalert: fix blocking hot-reload process if the old rule group hasn't started yet (#7258)
Group [sleeps](daa7183749/app/vmalert/rule/group.go (L320)
) random duration before start the evaluation, and during the sleep, `g.updateCh <- new` will be blocked since there is no `<-g.updateCh` waiting. --------- Signed-off-by: hagen1778 <roman@victoriametrics.com> Co-authored-by: hagen1778 <roman@victoriametrics.com> (cherry picked from commitc4fe23794a
) Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
2314668937
commit
abd2f34833
@ -324,14 +324,28 @@ func (g *Group) Start(ctx context.Context, nts func() []notifier.Notifier, rw re
|
||||
g.infof("will start in %v", sleepBeforeStart)
|
||||
|
||||
sleepTimer := time.NewTimer(sleepBeforeStart)
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-g.doneCh:
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-sleepTimer.C:
|
||||
randSleep:
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case <-g.doneCh:
|
||||
sleepTimer.Stop()
|
||||
return
|
||||
case ng := <-g.updateCh:
|
||||
g.mu.Lock()
|
||||
err := g.updateWith(ng)
|
||||
if err != nil {
|
||||
logger.Errorf("group %q: failed to update: %s", g.Name, err)
|
||||
g.mu.Unlock()
|
||||
continue
|
||||
}
|
||||
g.mu.Unlock()
|
||||
g.infof("reload successfully")
|
||||
case <-sleepTimer.C:
|
||||
break randSleep
|
||||
}
|
||||
}
|
||||
evalTS = evalTS.Add(sleepBeforeStart)
|
||||
}
|
||||
|
@ -175,6 +175,74 @@ func TestUpdateWith(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestUpdateDuringRandSleep(t *testing.T) {
|
||||
// enable rand sleep to test group update during sleep
|
||||
SkipRandSleepOnGroupStart = false
|
||||
defer func() {
|
||||
SkipRandSleepOnGroupStart = true
|
||||
}()
|
||||
rule := AlertingRule{
|
||||
Name: "jobDown",
|
||||
Expr: "up==0",
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
},
|
||||
}
|
||||
g := &Group{
|
||||
Name: "test",
|
||||
Rules: []Rule{
|
||||
&rule,
|
||||
},
|
||||
// big interval ensures big enough randSleep during start process
|
||||
Interval: 100 * time.Hour,
|
||||
updateCh: make(chan *Group),
|
||||
}
|
||||
go g.Start(context.Background(), nil, nil, nil)
|
||||
|
||||
rule1 := AlertingRule{
|
||||
Name: "jobDown",
|
||||
Expr: "up{job=\"vmagent\"}==0",
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
},
|
||||
}
|
||||
g1 := &Group{
|
||||
Rules: []Rule{
|
||||
&rule1,
|
||||
},
|
||||
}
|
||||
g.updateCh <- g1
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
g.mu.RLock()
|
||||
if g.Rules[0].(*AlertingRule).Expr != "up{job=\"vmagent\"}==0" {
|
||||
t.Fatalf("expected to have updated rule expr")
|
||||
}
|
||||
g.mu.RUnlock()
|
||||
|
||||
rule2 := AlertingRule{
|
||||
Name: "jobDown",
|
||||
Expr: "up{job=\"vmagent\"}==0",
|
||||
Labels: map[string]string{
|
||||
"foo": "bar",
|
||||
"baz": "qux",
|
||||
},
|
||||
}
|
||||
g2 := &Group{
|
||||
Rules: []Rule{
|
||||
&rule2,
|
||||
},
|
||||
}
|
||||
g.updateCh <- g2
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
g.mu.RLock()
|
||||
if len(g.Rules[0].(*AlertingRule).Labels) != 2 {
|
||||
t.Fatalf("expected to have updated labels")
|
||||
}
|
||||
g.mu.RUnlock()
|
||||
|
||||
g.Close()
|
||||
}
|
||||
|
||||
func TestGroupStart(t *testing.T) {
|
||||
const (
|
||||
rules = `
|
||||
|
@ -28,12 +28,13 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
|
||||
|
||||
* BUGFIX: [vmgateway](https://docs.victoriametrics.com/vmgateway/): fix possible panic during parsing of a token without `vm_access` claim. This issue was introduced in v1.104.0.
|
||||
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix error messages rendering from overflowing the screen with long messages. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7207).
|
||||
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
|
||||
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): properly add metrics tags for `opentsdb` migration source. Previously it could have empty values. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7161).
|
||||
* BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): reduce the initial health check interval for datasource. This reduces the time spent on evaluating rules by vmalert-tool. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6970).
|
||||
* BUGFIX: [vmalert-tool](https://docs.victoriametrics.com/vmalert-tool/): allow specifying empty labels list `labels: '{}'` in the same way as promtool does. This improves compatibility between these tools.
|
||||
* BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent/): support `m` unit for `minutes` duration in command-line flag `-streamAggr.dedupInterval`. Previously unit `m` wasn't supported correctly.
|
||||
* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly apply replication factor when storage node groups are used and replication factor is configured via global value such as `-replicationFactor=2`. Previously, global replication factor was ignored for storage node groups. See [these docs](https://docs.victoriametrics.com/cluster-victoriametrics/#vmstorage-groups-at-vmselect) for more information about storage groups configuration.
|
||||
* BUGFIX: `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): properly process response in [multi-level cluster setup](https://docs.victoriametrics.com/cluster-victoriametrics/#multi-level-cluster-setup). Before, vmselect could return no data in multi-level setup. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7270) for details. The issue was introduced in [v1.104.0](https://docs.victoriametrics.com/changelog/#v11040).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert): properly apply configuration changes during hot-reload to rule groups that haven't started yet. Previously, configuration updates to such groups could have resulted into blocking all evaluations within the group, until vmalert restart.
|
||||
|
||||
## [v1.104.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.104.0)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user