mirror of
https://github.com/VictoriaMetrics/VictoriaMetrics.git
synced 2025-01-20 07:19:17 +01:00
app/vmalert: fix data race during hot-config reload
During hot-reload, the logic invokes the group update and the interruption of rules evaluation simultaneously, falsely assuming that the interruption happens before the update. However, it could happen that the group is updated first and the rules evaluation is cancelled only afterwards, which results in a permanent interruption for all rules within the group. The fix caches the context cancel function in a local variable first, and only then performs the group update. With the cached cancel function we can safely call it without worrying that we cancel the evaluation of the already updated group. Signed-off-by: hagen1778 <roman@victoriametrics.com>
This commit is contained in:
parent
105cb44884
commit
a4bb7e8932
@ -156,11 +156,14 @@ func (m *manager) update(ctx context.Context, groupsCfg []config.Group, restore
|
||||
var wg sync.WaitGroup
|
||||
for _, item := range toUpdate {
|
||||
wg.Add(1)
|
||||
// cache the cancel func before updating the Group, as it might change during the Update
|
||||
evalCancel := item.old.EvalCancelFn
|
||||
go func(old *rule.Group, new *rule.Group) {
|
||||
old.UpdateWith(new)
|
||||
wg.Done()
|
||||
}(item.old, item.new)
|
||||
item.old.InterruptEval()
|
||||
// cancel current group evaluation to update Group as fast as possible
|
||||
evalCancel()
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
|
@ -278,16 +278,18 @@ func (g *Group) updateWith(newGroup *Group) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// InterruptEval interrupts in-flight rules evaluations
|
||||
// within the group. It is expected that g.evalCancel
|
||||
// will be repopulated after the call.
|
||||
func (g *Group) InterruptEval() {
|
||||
// EvalCancelFn returns the cancel function which would
|
||||
// stop the current evaluation of rules in this group.
|
||||
// Always returns non-nil result.
|
||||
func (g *Group) EvalCancelFn() context.CancelFunc {
|
||||
g.mu.RLock()
|
||||
defer g.mu.RUnlock()
|
||||
|
||||
cancelFn := func() {}
|
||||
if g.evalCancel != nil {
|
||||
g.evalCancel()
|
||||
cancelFn = g.evalCancel
|
||||
}
|
||||
return cancelFn
|
||||
}
|
||||
|
||||
// Close stops the group and its rules, unregisters group metrics
|
||||
@ -296,7 +298,8 @@ func (g *Group) Close() {
|
||||
return
|
||||
}
|
||||
close(g.doneCh)
|
||||
g.InterruptEval()
|
||||
cancel := g.EvalCancelFn()
|
||||
cancel()
|
||||
<-g.finishedCh
|
||||
|
||||
g.metrics.iterationDuration.Unregister()
|
||||
|
@ -71,6 +71,7 @@ The sandbox cluster installation is running under the constant load generated by
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check the `-external.url` scheme when starting vmalert; it must be `http` or `https`. Previously, alertmanager could reject alert notifications if `-external.url` had a missing or wrong scheme.
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): automatically add the `exported_` prefix to the original evaluation result label if it conflicts with an external or reserved one. Previously, the original label was overridden. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5161).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): autogenerate `ALERTS_FOR_STATE` time series for alerting rules with `for: 0`. Previously, `ALERTS_FOR_STATE` was generated only for alerts with `for > 0`. The change aligns with Prometheus behavior. See more details in [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5648).
|
||||
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): fix data race during hot-config reload. The race could cause all rules from the updated group to fail with the `context canceled` error.
|
||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): consistently sort results for `q1 or q2` query, so they do not change colors with each refresh in Grafana. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5393).
|
||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly return results from [bottomk](https://docs.victoriametrics.com/MetricsQL.html#bottomk) and `bottomk_*()` functions when some of these results contain NaN values. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5506). Thanks to @xiaozongyang for [the fix](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5509).
|
||||
* BUGFIX: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): properly handle queries, which wrap [rollup functions](https://docs.victoriametrics.com/MetricsQL.html#rollup-functions) with multiple arguments without explicitly specified lookbehind window in square brackets into [aggregate functions](https://docs.victoriametrics.com/MetricsQL.html#aggregate-functions). For example, `sum(quantile_over_time(0.5, process_resident_memory_bytes))` resulted in the `expecting at least 2 args to ...; got 1 args` error. Thanks to @atykhyy for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/5414).
|
||||
|
Loading…
Reference in New Issue
Block a user