app/{vminsert,vmselect}: follow-up after 2b7b3293c1

- Document the change at docs/CHANGELOG.md - Set the default value for -vmstorageUserTimeout to 3 seconds. This is much better than the 0 value, which means that TCP connection to unreachable vmstorage could block for up to 16 minutes. - Document -vmstorageUserTimeout at docs/Cluster-VictoriaMetrics.md
2024-12-12 12:46:23 +01:00 · 2023-08-29 12:09:24 +02:00 · 2023-08-29 12:09:24 +02:00 · 19d61737c1
commit 19d61737c1
parent 2b7b3293c1
8 changed files with 30 additions and 20 deletions
--- a/README.md
+++ b/README.md
@ -993,7 +993,9 @@ Below is the output for `/path/to/vminsert -help`:
  -version
     Show VictoriaMetrics version
  -vmstorageDialTimeout duration
-     Timeout for establishing RPC connections from vminsert to vmstorage (default 5s)
+     Timeout for establishing RPC connections from vminsert to vmstorage. See also -vmstorageUserTimeout (default 3s)
+  -vmstorageUserTimeout duration
+     Network timeout for RPC connections from vminsert to vmstorage (Linux only). Lower values speed up re-rerouting recovery when some of vmstorage nodes become unavailable because of networking issues. Read more about TCP_USER_TIMEOUT at https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ . See also -vmstorageDialTimeout (default 3s)
 ```

 ### List of command-line flags for vmselect
@ -1233,7 +1235,9 @@ Below is the output for `/path/to/vmselect -help`:
  -vmalert.proxyURL string
     Optional URL for proxying requests to vmalert. For example, if -vmalert.proxyURL=http://vmalert:8880 , then alerting API requests such as /api/v1/rules from Grafana will be proxied to http://vmalert:8880/api/v1/rules
  -vmstorageDialTimeout duration
-     Timeout for establishing RPC connections from vmselect to vmstorage (default 5s)
+     Timeout for establishing RPC connections from vmselect to vmstorage. See also -vmstorageUserTimeout (default 3s)
+  -vmstorageUserTimeout duration
+     Network timeout for RPC connections from vmselect to vmstorage (Linux only). Lower values reduce the maximum query durations when some vmstorage nodes become unavailable because of networking issues. Read more about TCP_USER_TIMEOUT at https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ . See also -vmstorageDialTimeout (default 3s)
  -vmui.customDashboardsPath string
     Optional path to vmui dashboards. See https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/app/vmui/packages/vmui/public/dashboards
 ```
--- a/app/vminsert/netstorage/netstorage.go
+++ b/app/vminsert/netstorage/netstorage.go
@ -31,11 +31,12 @@ var (
 		"Higher values for -dedup.minScrapeInterval at vmselect is OK")
 	disableRerouting      = flag.Bool("disableRerouting", true, "Whether to disable re-routing when some of vmstorage nodes accept incoming data at slower speed compared to other storage nodes. Disabled re-routing limits the ingestion rate by the slowest vmstorage node. On the other side, disabled re-routing minimizes the number of active time series in the cluster during rolling restarts and during spikes in series churn rate. See also -dropSamplesOnOverload")
 	dropSamplesOnOverload = flag.Bool("dropSamplesOnOverload", false, "Whether to drop incoming samples if the destination vmstorage node is overloaded and/or unavailable. This prioritizes cluster availability over consistency, e.g. the cluster continues accepting all the ingested samples, but some of them may be dropped if vmstorage nodes are temporarily unavailable and/or overloaded. The drop of samples happens before the replication, so it's not recommended to use this flag with -replicationFactor enabled.")
-	vmstorageDialTimeout  = flag.Duration("vmstorageDialTimeout", 5*time.Second, "Timeout for establishing RPC connections from vminsert to vmstorage")
-	vmstorageUserTimeout  = flag.Duration("vmstorageUserTimeout", 0, "TCP user timeout for RPC connections from vminsert to vmstorage (Linux only). "+
-		"When greater than 0, it specifies the maximum amount of time transmitted data may remain unacknowledged before the TCP connection is closed."+
-		"Setting a low TCP user timeout allows inserts to reroute around unresponsive storage nodes faster than the full insert timeout (at least 60 seconds)."+
-		"By default, this timeout is disabled.")
+	vmstorageDialTimeout  = flag.Duration("vmstorageDialTimeout", 3*time.Second, "Timeout for establishing RPC connections from vminsert to vmstorage. "+
+		"See also -vmstorageUserTimeout")
+	vmstorageUserTimeout = flag.Duration("vmstorageUserTimeout", 3*time.Second, "Network timeout for RPC connections from vminsert to vmstorage (Linux only). "+
+		"Lower values speed up re-rerouting recovery when some of vmstorage nodes become unavailable because of networking issues. "+
+		"Read more about TCP_USER_TIMEOUT at https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ . "+
+		"See also -vmstorageDialTimeout")
 )

 var errStorageReadOnly = errors.New("storage node is read only")
--- a/app/vmselect/netstorage/netstorage.go
+++ b/app/vmselect/netstorage/netstorage.go
@ -41,12 +41,12 @@ var (
 		"copies at vmstorage nodes. Consider enabling this setting only if all the queried data contains -replicationFactor copies in the cluster")
 	maxSamplesPerSeries  = flag.Int("search.maxSamplesPerSeries", 30e6, "The maximum number of raw samples a single query can scan per each time series. See also -search.maxSamplesPerQuery")
 	maxSamplesPerQuery   = flag.Int("search.maxSamplesPerQuery", 1e9, "The maximum number of raw samples a single query can process across all time series. This protects from heavy queries, which select unexpectedly high number of raw samples. See also -search.maxSamplesPerSeries")
-	vmstorageDialTimeout = flag.Duration("vmstorageDialTimeout", 5*time.Second, "Timeout for establishing RPC connections from vmselect to vmstorage")
-	vmstorageUserTimeout = flag.Duration("vmstorageUserTimeout", 0, "TCP user timeout for RPC connections from vmselect to vmstorage (Linux only). "+
-		"When greater than 0, it specifies the maximum amount of time transmitted data may remain unacknowledged before the TCP connection is closed. "+
-		"Setting a low TCP user timeout allows queries to ignore unresponsive storage nodes faster than the max query duration. "+
-		"By default, this timeout is disabled. "+
-		"See also -search.maxQueryDuration")
+	vmstorageDialTimeout = flag.Duration("vmstorageDialTimeout", 3*time.Second, "Timeout for establishing RPC connections from vmselect to vmstorage. "+
+		"See also -vmstorageUserTimeout")
+	vmstorageUserTimeout = flag.Duration("vmstorageUserTimeout", 3*time.Second, "Network timeout for RPC connections from vmselect to vmstorage (Linux only). "+
+		"Lower values reduce the maximum query durations when some vmstorage nodes become unavailable because of networking issues. "+
+		"Read more about TCP_USER_TIMEOUT at https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ . "+
+		"See also -vmstorageDialTimeout")
 )

 // Result is a single timeseries result.
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@ -28,6 +28,7 @@ The following `tip` changes can be tested by building VictoriaMetrics components
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): add ability to set `member num` label for all the metrics scraped by a particular `vmagent` instance in [a cluster of vmagents](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets) via `-promscrape.cluster.memberLabel` command-line flag. See [these docs](https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets) and [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4247).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): do not log `unexpected EOF` when reading incoming metrics, since this error is expected and is handled during metrics' parsing. This reduces the amounts of noisy logs. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4817).
 * FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): retry failed write request on the closed connection immediately, without waiting for backoff. This should improve data delivery speed and reduce amount of error logs emitted by vmagent when using idle connections. See related [issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4139).
+* FEATURE: [VictoriaMetrics cluster](https://docs.victoriametrics.com/Cluster-VictoriaMetrics.html): reduce the maximum recovery time at `vmselect` and `vminsert` when some of `vmstorage` nodes become unavailable because of networking issues from 60 seconds to 3 seconds by default. The recovery time can be tuned at `vmselect` and `vminsert` nodes with `-vmstorageUserTimeout` command-line flag if needed. Thanks to @wjordan for [the pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4423).
 * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): make the warning message more noticeable for text fields. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4848).
 * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): add button for auto-formatting PromQL/MetricsQL queries. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/4681). Thanks to @aramattamara for the [pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/4694).
 * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): improve accessibility score to 100 according to [Google's Lighthouse](https://developer.chrome.com/docs/lighthouse/accessibility/) tests.
--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@ -1004,7 +1004,9 @@ Below is the output for `/path/to/vminsert -help`:
  -version
     Show VictoriaMetrics version
  -vmstorageDialTimeout duration
-     Timeout for establishing RPC connections from vminsert to vmstorage (default 5s)
+     Timeout for establishing RPC connections from vminsert to vmstorage. See also -vmstorageUserTimeout (default 3s)
+  -vmstorageUserTimeout duration
+     Network timeout for RPC connections from vminsert to vmstorage (Linux only). Lower values speed up re-rerouting recovery when some of vmstorage nodes become unavailable because of networking issues. Read more about TCP_USER_TIMEOUT at https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ . See also -vmstorageDialTimeout (default 3s)
 ```

 ### List of command-line flags for vmselect
@ -1244,7 +1246,9 @@ Below is the output for `/path/to/vmselect -help`:
  -vmalert.proxyURL string
     Optional URL for proxying requests to vmalert. For example, if -vmalert.proxyURL=http://vmalert:8880 , then alerting API requests such as /api/v1/rules from Grafana will be proxied to http://vmalert:8880/api/v1/rules
  -vmstorageDialTimeout duration
-     Timeout for establishing RPC connections from vmselect to vmstorage (default 5s)
+     Timeout for establishing RPC connections from vmselect to vmstorage. See also -vmstorageUserTimeout (default 3s)
+  -vmstorageUserTimeout duration
+     Network timeout for RPC connections from vmselect to vmstorage (Linux only). Lower values reduce the maximum query durations when some vmstorage nodes become unavailable because of networking issues. Read more about TCP_USER_TIMEOUT at https://blog.cloudflare.com/when-tcp-sockets-refuse-to-die/ . See also -vmstorageDialTimeout (default 3s)
  -vmui.customDashboardsPath string
     Optional path to vmui dashboards. See https://github.com/VictoriaMetrics/VictoriaMetrics/tree/master/app/vmui/packages/vmui/public/dashboards
 ```
--- a/lib/netutil/conn_pool.go
+++ b/lib/netutil/conn_pool.go
@ -49,7 +49,7 @@ type connWithTimestamp struct {
 // The compression is disabled if compressionLevel <= 0.
 //
 // Call ConnPool.MustStop when the returned ConnPool is no longer needed.
-func NewConnPool(ms *metrics.Set, name, addr string, handshakeFunc handshake.Func, compressionLevel int, dialTimeout time.Duration, userTimeout time.Duration) *ConnPool {
+func NewConnPool(ms *metrics.Set, name, addr string, handshakeFunc handshake.Func, compressionLevel int, dialTimeout, userTimeout time.Duration) *ConnPool {
 	cp := &ConnPool{
 		d:                 NewTCPDialer(ms, name, addr, dialTimeout, userTimeout),
 		concurrentDialsCh: make(chan struct{}, 8),
--- a/lib/netutil/tcpdialer.go
+++ b/lib/netutil/tcpdialer.go
@ -13,7 +13,7 @@ import (
 //
 // The name is used in metric tags for the returned dialer.
 // The name must be unique among dialers.
-func NewTCPDialer(ms *metrics.Set, name, addr string, dialTimeout time.Duration, userTimeout time.Duration) *TCPDialer {
+func NewTCPDialer(ms *metrics.Set, name, addr string, dialTimeout, userTimeout time.Duration) *TCPDialer {
 	d := &TCPDialer{
 		d: &net.Dialer{
 			Timeout: dialTimeout,
@ -29,7 +29,8 @@ func NewTCPDialer(ms *metrics.Set, name, addr string, dialTimeout time.Duration,
 	}
 	d.connMetrics.init(ms, "vm_tcpdialer", name, addr)
 	if userTimeout > 0 {
-		d.d.Control = func(network, address string, c syscall.RawConn) (err error) {
+		d.d.Control = func(network, address string, c syscall.RawConn) error {
+			var err error
 			controlErr := c.Control(func(fd uintptr) {
 				err = setTCPUserTimeout(fd, userTimeout)
 			})
--- a/lib/netutil/tcpdialer_linux.go
+++ b/lib/netutil/tcpdialer_linux.go
@ -7,6 +7,5 @@ import (
 )

 func setTCPUserTimeout(fd uintptr, timeout time.Duration) error {
-	return syscall.SetsockoptInt(
-		int(fd), syscall.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int(timeout.Milliseconds()))
+	return syscall.SetsockoptInt(int(fd), syscall.IPPROTO_TCP, unix.TCP_USER_TIMEOUT, int(timeout.Milliseconds()))
 }