From c888d76c4be201abddd9ebfe68ea85df2a424113 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Wed, 20 Dec 2023 19:53:46 +0200 Subject: [PATCH] app/vmselect/netstorage: make sure that at least a single result is collected from every storage group before deciding whether it is OK to skip results from the remaining storage nodes --- app/vmselect/netstorage/netstorage.go | 21 +++++++++++++++++---- docs/CHANGELOG.md | 1 + 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index 08886657fd..eff53be7d8 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -1754,10 +1754,14 @@ func (snr *storageNodesRequest) collectAllResults(f func(result interface{}) err } func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Counter, f func(result interface{}) error) (bool, error) { - errsPartialPerGroup := make(map[*storageNodesGroup][]error) - resultsCollectedPerGroup := make(map[*storageNodesGroup]int) sns := snr.sns - for i := 0; i < len(sns); i++ { + if len(sns) == 0 { + return false, nil + } + groupsCount := sns[0].group.groupsCount + resultsCollectedPerGroup := make(map[*storageNodesGroup]int, groupsCount) + errsPartialPerGroup := make(map[*storageNodesGroup][]error) + for range sns { // There is no need in timer here, since all the goroutines executing the f function // passed to startStorageNodesRequest must be finished until the deadline. 
result := <-snr.resultsCh @@ -1799,7 +1803,7 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co } snr.finishQueryTracer(result.qt, "") resultsCollectedPerGroup[group]++ - if *skipSlowReplicas { + if *skipSlowReplicas && len(resultsCollectedPerGroup) == groupsCount { canSkipSlowReplicas := true for g, n := range resultsCollectedPerGroup { if n <= g.nodesCount-g.replicationFactor { @@ -1870,6 +1874,9 @@ type storageNodesGroup struct { // the number of nodes in the group nodesCount int + + // groupsCount is the number of groups in the list the given group belongs to + groupsCount int } func initStorageNodeGroups(addrs []string) map[string]*storageNodesGroup { @@ -1886,6 +1893,12 @@ func initStorageNodeGroups(addrs []string) map[string]*storageNodesGroup { } g.nodesCount++ } + + groupsCount := len(m) + for _, g := range m { + g.groupsCount = groupsCount + } + return m } diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 957959e5ca..bdb5a470cc 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -33,6 +33,7 @@ The sandbox cluster installation is running under the constant load generated by * FEATURE: all VictoriaMetrics components: add ability to specify arbitrary HTTP headers to send with every request to `-pushmetrics.url`. See [`push metrics` docs](https://docs.victoriametrics.com/#push-metrics). * FEATURE: all VictoriaMetrics components: add `-metrics.exposeMetadata` command-line flag, which allows displaying `TYPE` and `HELP` metadata at `/metrics` page exposed at `-httpListenAddr`. This may be needed when the `/metrics` page is scraped by collector, which requires the `TYPE` and `HELP` metadata such as [Google Cloud Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus/troubleshooting#missing-metric-type). 
+* BUGFIX: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): properly return full results when the `-search.skipSlowReplicas` command-line flag is passed to `vmselect` and [vmstorage groups](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#vmstorage-groups-at-vmselect) are in use. Previously, partial results could be returned in this case. * BUGFIX: `vminsert`: properly accept samples via [OpenTelemetry data ingestion protocol](https://docs.victoriametrics.com/#sending-data-via-opentelemetry) when these samples have no [resource attributes](https://opentelemetry.io/docs/instrumentation/go/resources/). Previously such samples were silently skipped. * BUGFIX: `vmstorage`: added missing `-inmemoryDataFlushInterval` command-line flag, which was missing in [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html) after implementing [this feature](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3337) in [v1.85.0](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.85.0). * BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): check `-external.url` schema when starting vmalert, must be `http` or `https`. Before, alertmanager could reject alert notifications if `-external.url` contained no or wrong schema.