app/vmauth: properly check for backend health before leastLoaded policy (#7489)

Previously, vmauth could have pick `buMin` as least loaded backend
without checking its status. In result, vmauth could have respond to the
user with an error even if there were healthy backends. That could
happen if healthy backends already had non-zero amount of concurrent
requests executing at the moment of least-loaded backend choosing logic.

Steps to reproduce:
1. Setup vmauth with two backends: healthy and non-healthy
2. Execute a bunch of concurrent requests against vmauth (i.e. Grafana
dash reload)
3. Observe that some requests will fail with message that all backends
are unavailable

Addresses https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061
---
Signed-off-by: hagen1778 <roman@victoriametrics.com>
(cherry picked from commit a0a154511a)
This commit is contained in:
Roman Khavronenko 2024-11-08 11:45:16 +01:00 committed by hagen1778
parent e60cce54a8
commit 07ad4f82cd
No known key found for this signature in database
GPG Key ID: E92986095E0DD614
3 changed files with 28 additions and 2 deletions

View File

@ -462,13 +462,17 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) *
// Slow path - select other backend urls. // Slow path - select other backend urls.
n := atomicCounter.Add(1) - 1 n := atomicCounter.Add(1) - 1
buMin := bus[n%uint32(len(bus))]
for i := uint32(0); i < uint32(len(bus)); i++ { for i := uint32(0); i < uint32(len(bus)); i++ {
idx := (n + i) % uint32(len(bus)) idx := (n + i) % uint32(len(bus))
bu := bus[idx] bu := bus[idx]
if bu.isBroken() { if bu.isBroken() {
continue continue
} }
if buMin.isBroken() {
// verify that buMin isn't set as broken
buMin = bu
}
if bu.concurrentRequests.Load() == 0 { if bu.concurrentRequests.Load() == 0 {
// Fast path - return the backend with zero concurrently executed requests. // Fast path - return the backend with zero concurrently executed requests.
// Do not use CompareAndSwap() instead of Load(), since it is much slower on systems with many CPU cores. // Do not use CompareAndSwap() instead of Load(), since it is much slower on systems with many CPU cores.
@ -478,7 +482,6 @@ func getLeastLoadedBackendURL(bus []*backendURL, atomicCounter *atomic.Uint32) *
} }
// Slow path - return the backend with the minimum number of concurrently executed requests. // Slow path - return the backend with the minimum number of concurrently executed requests.
buMin := bus[n%uint32(len(bus))]
minRequests := buMin.concurrentRequests.Load() minRequests := buMin.concurrentRequests.Load()
for _, bu := range bus { for _, bu := range bus {
if bu.isBroken() { if bu.isBroken() {

View File

@ -777,6 +777,28 @@ func TestGetLeastLoadedBackendURL(t *testing.T) {
fn(7, 7, 7) fn(7, 7, 7)
} }
func TestBrokenBackend(t *testing.T) {
up := mustParseURLs([]string{
"http://node1:343",
"http://node2:343",
"http://node3:343",
})
up.loadBalancingPolicy = "least_loaded"
pbus := up.bus.Load()
bus := *pbus
// explicitly mark one of the backends as broken
bus[1].setBroken()
// broken backend should never return while there are healthy backends
for i := 0; i < 1e3; i++ {
b := up.getBackendURL()
if b.isBroken() {
t.Fatalf("unexpected broken backend %q", b.url)
}
}
}
func getRegexs(paths []string) []*Regex { func getRegexs(paths []string) []*Regex {
var sps []*Regex var sps []*Regex
for _, path := range paths { for _, path := range paths {

View File

@ -21,6 +21,7 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
* BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): drop rows that do not belong to the current series during import. The dropped rows should belong to another series whose tags are a superset of the current series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7301) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7330). Thanks to @dpedu for reporting and cooperating with the test. * BUGFIX: [vmctl](https://docs.victoriametrics.com/vmctl/): drop rows that do not belong to the current series during import. The dropped rows should belong to another series whose tags are a superset of the current series. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7301) and [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7330). Thanks to @dpedu for reporting and cooperating with the test.
* BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): keep the order of resulting time series when `limit_offset` is applied. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7068). * BUGFIX: [vmsingle](https://docs.victoriametrics.com/single-server-victoriametrics/), `vmselect` in [VictoriaMetrics cluster](https://docs.victoriametrics.com/cluster-victoriametrics/): keep the order of resulting time series when `limit_offset` is applied. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/7068).
* BUGFIX: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): properly handle xFilesFactor=0 for `transformRemoveEmptySeries` function. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7337) for details. * BUGFIX: [graphite](https://docs.victoriametrics.com/#graphite-render-api-usage): properly handle xFilesFactor=0 for `transformRemoveEmptySeries` function. See [this PR](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/7337) for details.
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth): properly check availability of all the backends before giving up when proxying requests. Previously, vmauth could return an error even if there were healthy backends available. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3061) for details.
## [v1.106.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.0) ## [v1.106.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.106.0)