app/vmselect: return 503 status code when partial responses are denied and some of vmstorage nodes are temporarily unavailable

This should help detect this case and automatically retry the query at a healthy cluster replica
in another availability zone.

This commit is needed as a preparation for automatic query retry at another backend at vmauth on 5xx errors
as described at https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4792#issuecomment-1674338561
This commit is contained in:
Aliaksandr Valialkin 2023-09-07 16:07:03 +02:00
parent 4af79504f6
commit 58326dbf25
No known key found for this signature in database
GPG Key ID: A72BEC6CD3D0DED1
2 changed files with 13 additions and 1 deletions

View File

@ -1754,6 +1754,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
// and the number of partial responses reach -replicationFactor, // and the number of partial responses reach -replicationFactor,
// since this means that the response is partial. // since this means that the response is partial.
snr.finishQueryTracers("cancel request because partial responses are denied and some vmstorage nodes failed to return response") snr.finishQueryTracers("cancel request because partial responses are denied and some vmstorage nodes failed to return response")
// Returns 503 status code for partial response, so the caller could retry it if needed.
err = &httpserver.ErrorWithStatusCode{
Err: err,
StatusCode: http.StatusServiceUnavailable,
}
return false, err return false, err
} }
continue continue
@ -1780,7 +1786,12 @@ func (snr *storageNodesRequest) collectResults(partialResultsCounter *metrics.Co
if len(errsPartial) == len(sns) { if len(errsPartial) == len(sns) {
// All the vmstorage nodes returned error. // All the vmstorage nodes returned error.
// Return only the first error, since it has no sense in returning all errors. // Return only the first error, since it has no sense in returning all errors.
return false, errsPartial[0] // Returns 503 status code for partial response, so the caller could retry it if needed.
err := &httpserver.ErrorWithStatusCode{
Err: errsPartial[0],
StatusCode: http.StatusServiceUnavailable,
}
return false, err
} }
// Return partial results. // Return partial results.
// This allows gracefully degrade vmselect in the case // This allows gracefully degrade vmselect in the case

View File

@ -49,6 +49,7 @@ ssue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4825) and [these
* BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix the bug causing render looping when switching to heatmap. * BUGFIX: [vmui](https://docs.victoriametrics.com/#vmui): fix the bug causing render looping when switching to heatmap.
* BUGFIX: [vminsert enterprise](https://docs.victoriametrics.com/enterprise.html): properly parse `/insert/multitenant/*` urls, which have been broken since [v1.93.2](#v1932). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4947). * BUGFIX: [vminsert enterprise](https://docs.victoriametrics.com/enterprise.html): properly parse `/insert/multitenant/*` urls, which have been broken since [v1.93.2](#v1932). See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4947).
* BUGFIX: properly build production armv5 binaries for `GOARCH=arm`. This has been broken after the upgrading of Go builder to Go1.21.0. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4965). * BUGFIX: properly build production armv5 binaries for `GOARCH=arm`. This has been broken after the upgrading of Go builder to Go1.21.0. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4965).
* BUGFIX: [vmselect](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): return `503 Service Unavailable` status code when [partial responses](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html#cluster-availability) are denied and some of the `vmstorage` nodes are temporarily unavailable. Previously `422 Unprocessable Entity` status code was mistakenly returned in this case, which could prevent automatic recovery by re-sending the request to a healthy cluster replica in another availability zone.
## [v1.93.3](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.3) ## [v1.93.3](https://github.com/VictoriaMetrics/VictoriaMetrics/releases/tag/v1.93.3)