app/vmagent: support for DNS SRV urls at -remoteWrite.url, scrape target urls and service discovery urls

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053
This commit is contained in:
Aliaksandr Valialkin 2024-04-17 20:47:59 +02:00
parent b426d10847
commit dc326f70b4
No known key found for this signature in database
GPG Key ID: 52C003EE2BCDB9EB
7 changed files with 44 additions and 62 deletions

View File

@ -3,34 +3,15 @@ package remotewrite
import (
"context"
"net"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/metrics"
)
// getStdDialer returns the package-wide net.Dialer, creating it on the first call.
//
// The dialer is constructed exactly once under stdDialerOnce, so every caller
// shares the same instance configured with a 30-second connect timeout and a
// 30-second keep-alive interval.
func getStdDialer() *net.Dialer {
stdDialerOnce.Do(func() {
stdDialer = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
// DualStack mirrors whatever netutil.TCP6Enabled() reports.
DualStack: netutil.TCP6Enabled(),
}
})
return stdDialer
}
var (
stdDialer *net.Dialer
stdDialerOnce sync.Once
)
func statDial(ctx context.Context, _, addr string) (conn net.Conn, err error) {
network := netutil.GetTCPNetwork()
d := getStdDialer()
conn, err = d.DialContext(ctx, network, addr)
conn, err = netutil.DialMaybeSRV(ctx, network, addr)
dialsTotal.Inc()
if err != nil {
dialErrors.Inc()

View File

@ -30,9 +30,11 @@ See also [LTS releases](https://docs.victoriametrics.com/lts-releases/).
## tip
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): in the Select component, user-entered values are now preserved on blur if they match options in the list.
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support regex matching when routing incoming requests based on HTTP [query args](https://en.wikipedia.org/wiki/Query_string) via `src_query_args` option at `url_map`. See [these docs](https://docs.victoriametrics.com/vmauth/#generic-http-proxy-for-different-backends) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6070).
* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent/): support [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) addresses in `-remoteWrite.url` command-line option and in scrape target urls. For example, `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` automatically resolves the `victoria-metrics` DNS SRV to a list of hostnames with TCP ports and then sends the collected metrics to these TCP addresses. See [these docs](https://docs.victoriametrics.com/vmagent/#srv-urls) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6053).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support automatic discovering and load balancing for TCP addresses behind DNS SRV addresses. These addresses can be put inside `url_prefix` urls in the form `http://srv+addr/path`, where the `addr` is the [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address, which is automatically resolved to hostnames with TCP ports. See [these docs](https://docs.victoriametrics.com/vmauth/#srv-urls) for details.
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support specifying client TLS certificates and TLS ServerName for requests to HTTPS backends. See [these docs](https://docs.victoriametrics.com/vmauth/#backend-tls-setup).
* FEATURE: [vmauth](https://docs.victoriametrics.com/vmauth/): support regex matching when routing incoming requests based on HTTP [query args](https://en.wikipedia.org/wiki/Query_string) via `src_query_args` option at `url_map`. See [these docs](https://docs.victoriametrics.com/vmauth/#generic-http-proxy-for-different-backends) and [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6070).
* FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): in the Select component, user-entered values are now preserved on blur if they match options in the list.
* BUGFIX: [vmalert](https://docs.victoriametrics.com/vmalert.html): support any status code from the range 200-299 returned by alertmanager. Previously, only the 200 status code was considered a successful action. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/6110).
* BUGFIX: [vmauth](https://docs.victoriametrics.com/vmauth/): don't treat concurrency limit hit as an error of the backend. Previously, hitting the concurrency limit would increment both `vmauth_concurrent_requests_limit_reached_total` and `vmauth_user_request_backend_errors_total` counters. Now, only concurrency limit counter is incremented. Updates [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5565).

View File

@ -68,6 +68,7 @@ and sending the data to the Prometheus-compatible remote storage:
to run `vmagent` with `-promscrape.config.strictParse=false` command-line flag.
In this case `vmagent` ignores unsupported sections. See [the list of unsupported sections](#unsupported-prometheus-config-sections).
* `-remoteWrite.url` with Prometheus-compatible remote storage endpoint such as VictoriaMetrics, where to send the data to.
The `-remoteWrite.url` may refer to [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) address. See [these docs](#srv-urls) for details.
Example command for writing the data received via [supported push-based protocols](#how-to-push-data-to-vmagent)
to [single-node VictoriaMetrics](https://docs.victoriametrics.com/) located at `victoria-metrics-host:8428`:
@ -129,7 +130,7 @@ additionally to pull-based Prometheus-compatible targets' scraping:
* Sending HTTP request to `http://vmagent:8429/-/reload` endpoint. This endpoint can be protected with `-reloadAuthKey` command-line flag.
There is also `-promscrape.configCheckInterval` command-line option, which can be used for automatic reloading configs from updated `-promscrape.config` file.
There is also `-promscrape.configCheckInterval` command-line flag, which can be used for automatic reloading configs from updated `-promscrape.config` file.
## Use cases
@ -272,6 +273,24 @@ for the collected samples. Examples:
./vmagent -remoteWrite.url=http://remote-storage/api/v1/write -streamAggr.dropInputLabels=replica -remoteWrite.streamAggr.dedupInterval=60s
```
## SRV urls
If `vmagent` encounters urls with `srv+` prefix in hostname (such as `http://srv+some-addr/some/path`), then it resolves the `some-addr` [DNS SRV](https://en.wikipedia.org/wiki/SRV_record)
record into a TCP address with hostname and TCP port, and then uses the resulting url when it needs to connect to it.
SRV urls are supported in the following places:
- In `-remoteWrite.url` command-line flags. For example, if `victoria-metrics` [DNS SRV](https://en.wikipedia.org/wiki/SRV_record) record contains
`victoria-metrics-host:8428` TCP address, then `-remoteWrite.url=http://srv+victoria-metrics/api/v1/write` is automatically resolved into
`-remoteWrite.url=http://victoria-metrics-host:8428/api/v1/write`. If the DNS SRV record is resolved into multiple TCP addresses, then `vmagent`
uses randomly chosen address per each connection it establishes to the remote storage.
- In scrape target addresses aka `__address__` label - see [these docs](https://docs.victoriametrics.com/relabeling/#how-to-modify-scrape-urls-in-targets) for details.
- In urls used for [service discovery](https://docs.victoriametrics.com/sd_configs/).
SRV urls are useful when HTTP services run on different TCP ports or when they can change TCP ports over time (for instance, after the restart).
## VictoriaMetrics remote write protocol
`vmagent` supports sending data to the configured `-remoteWrite.url` either via Prometheus remote write protocol
@ -419,7 +438,7 @@ There is no need in specifying top-level `scrape_configs` section in these files
The list of supported service discovery types is available [here](#how-to-collect-metrics-in-prometheus-format).
Additionally, `vmagent` doesn't support `refresh_interval` option at service discovery sections.
This option is substituted with `-promscrape.*CheckInterval` command-line options, which are specific per each service discovery type.
This option is substituted with `-promscrape.*CheckInterval` command-line flags, which are specific per each service discovery type.
See [the full list of command-line flags for vmagent](#advanced-usage).
## Adding labels to metrics
@ -506,7 +525,7 @@ and attaches `instance`, `job` and other target-specific labels to these metrics
sum_over_time(scrape_series_added[1h]) > 1000
```
`vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line option
`vmagent` sets `scrape_series_added` to zero when it runs with `-promscrape.noStaleMarkers` command-line flag
or when it scrapes target with `no_stale_markers: true` option, e.g. when [staleness markers](#prometheus-staleness-markers) are disabled.
* `scrape_series_limit` - the limit on the number of unique time series the given target can expose according to [these docs](#cardinality-limiter).
@ -1117,14 +1136,14 @@ If you have suggestions for improvements or have found a bug - please open an is
as `vmagent` establishes at least a single TCP connection per target.
* If `vmagent` uses too big amounts of memory, then the following options can help:
* Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line option.
* Reducing the amounts of RAM vmagent can use for in-memory buffering with `-memory.allowedPercent` or `-memory.allowedBytes` command-line flag.
Another option is to reduce memory limits in Docker and/or Kubernetes if `vmagent` runs under these systems.
* Reducing the number of CPU cores vmagent can use by passing `GOMAXPROCS=N` environment variable to `vmagent`,
where `N` is the desired limit on CPU cores. Another option is to reduce CPU limits in Docker or Kubernetes if `vmagent` runs under these systems.
* Disabling staleness tracking with `-promscrape.noStaleMarkers` option. See [these docs](#prometheus-staleness-markers).
* Enabling stream parsing mode if `vmagent` scrapes targets with millions of metrics per target. See [these docs](#stream-parsing-mode).
* Reducing the number of tcp connections to remote storage systems with `-remoteWrite.queues` command-line option.
* Passing `-promscrape.dropOriginalLabels` command-line option to `vmagent` if it [discovers](https://docs.victoriametrics.com/sd_configs.html)
* Reducing the number of tcp connections to remote storage systems with `-remoteWrite.queues` command-line flag.
* Passing `-promscrape.dropOriginalLabels` command-line flag to `vmagent` if it [discovers](https://docs.victoriametrics.com/sd_configs.html)
big number of targets and many of these targets are [dropped](https://docs.victoriametrics.com/relabeling.html#how-to-drop-discovered-targets)
before scraping. In this case `vmagent` drops `"discoveredLabels"` and `"droppedTargets"`
lists at `http://vmagent-host:8429/service-discovery` page. This reduces memory usage when scraping big number of targets at the cost
@ -1142,7 +1161,7 @@ If you have suggestions for improvements or have found a bug - please open an is
may result in increased memory usage if a big number of scrape targets are dropped during relabeling.
* It is recommended increasing `-remoteWrite.queues` if `vmagent_remotewrite_pending_data_bytes` [metric](#monitoring)
grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` and `-remoteWrite.maxRowsPerBlock` command-line options in this case.
grows constantly. It is also recommended increasing `-remoteWrite.maxBlockSize` and `-remoteWrite.maxRowsPerBlock` command-line flags in this case.
This can improve data ingestion performance to the configured remote storage systems at the cost of higher memory usage.
* If you see gaps in the data pushed by `vmagent` to remote storage when `-remoteWrite.maxDiskUsagePerURL` is set,
@ -1387,7 +1406,7 @@ See how to request a free trial license [here](https://victoriametrics.com/produ
### Reading metrics from Kafka
[Enterprise version](https://docs.victoriametrics.com/enterprise/) of `vmagent` can read metrics in various formats from Kafka messages.
These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line options. The following formats are supported:
These formats can be configured with `-kafka.consumer.topic.defaultFormat` or `-kafka.consumer.topic.format` command-line flags. The following formats are supported:
* `promremotewrite` - [Prometheus remote_write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write).
Messages in this format can be sent by vmagent - see [these docs](#writing-metrics-to-kafka).

View File

@ -26,6 +26,10 @@ func IsTrivialNetworkError(err error) bool {
func DialMaybeSRV(ctx context.Context, network, addr string) (net.Conn, error) {
if strings.HasPrefix(addr, "srv+") {
addr = strings.TrimPrefix(addr, "srv+")
if n := strings.IndexByte(addr, ':'); n >= 0 {
// Drop port, since it should be automatically resolved via DNS SRV lookup below.
addr = addr[:n]
}
_, addrs, err := Resolver.LookupSRV(ctx, "", "", addr)
if err != nil {
return nil, fmt.Errorf("cannot resolve SRV addr %s: %w", addr, err)

View File

@ -10,6 +10,7 @@ import (
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/logger"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promscrape/discoveryutils"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
)
@ -74,7 +75,7 @@ func getMXAddrLabels(ctx context.Context, sdc *SDConfig) []*promutils.Labels {
ch := make(chan result, len(sdc.Names))
for _, name := range sdc.Names {
go func(name string) {
mx, err := resolver.LookupMX(ctx, name)
mx, err := netutil.Resolver.LookupMX(ctx, name)
ch <- result{
name: name,
mx: mx,
@ -109,7 +110,7 @@ func getSRVAddrLabels(ctx context.Context, sdc *SDConfig) []*promutils.Labels {
ch := make(chan result, len(sdc.Names))
for _, name := range sdc.Names {
go func(name string) {
_, as, err := resolver.LookupSRV(ctx, "", "", name)
_, as, err := netutil.Resolver.LookupSRV(ctx, "", "", name)
ch <- result{
name: name,
as: as,
@ -148,7 +149,7 @@ func getAAddrLabels(ctx context.Context, sdc *SDConfig, lookupType string) ([]*p
ch := make(chan result, len(sdc.Names))
for _, name := range sdc.Names {
go func(name string) {
ips, err := resolver.LookupIPAddr(ctx, name)
ips, err := netutil.Resolver.LookupIPAddr(ctx, name)
ch <- result{
name: name,
ips: ips,
@ -192,8 +193,3 @@ func appendAddrLabels(ms []*promutils.Labels, name, target string, port int) []*
m.Add("__meta_dns_srv_record_port", strconv.Itoa(port))
return append(ms, m)
}
var resolver = &net.Resolver{
PreferGo: true,
StrictErrors: true,
}

View File

@ -15,6 +15,7 @@ import (
"github.com/VictoriaMetrics/metrics"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promauth"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/proxy"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/timerpool"
@ -87,8 +88,6 @@ func (hc *HTTPClient) stop() {
hc.client.CloseIdleConnections()
}
var defaultDialer = &net.Dialer{}
// NewClient returns new Client for the given args.
func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxyAC *promauth.Config, httpCfg *promauth.HTTPClientConfig) (*Client, error) {
u, err := url.Parse(apiServer)
@ -96,13 +95,13 @@ func NewClient(apiServer string, ac *promauth.Config, proxyURL *proxy.URL, proxy
return nil, fmt.Errorf("cannot parse apiServer=%q: %w", apiServer, err)
}
dialFunc := defaultDialer.DialContext
dialFunc := netutil.DialMaybeSRV
if u.Scheme == "unix" {
// special case for unix socket connection
dialAddr := u.Path
apiServer = "http://unix"
dialFunc = func(ctx context.Context, _, _ string) (net.Conn, error) {
return defaultDialer.DialContext(ctx, "unix", dialAddr)
return netutil.Dialer.DialContext(ctx, "unix", dialAddr)
}
}

View File

@ -6,18 +6,15 @@ import (
"net"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/netutil"
"github.com/VictoriaMetrics/metrics"
)
func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) {
d := getStdDialer()
network := netutil.GetTCPNetwork()
conn, err := d.DialContext(ctx, network, addr)
conn, err := netutil.DialMaybeSRV(ctx, network, addr)
dialsTotal.Inc()
if err != nil {
dialErrors.Inc()
@ -33,22 +30,6 @@ func statStdDial(ctx context.Context, _, addr string) (net.Conn, error) {
return sc, nil
}
// getStdDialer lazily initializes and returns the shared net.Dialer.
//
// Initialization is guarded by stdDialerOnce, so the dialer is built a single
// time and reused by all subsequent calls. It uses a 30-second dial timeout
// and a 30-second keep-alive period.
func getStdDialer() *net.Dialer {
stdDialerOnce.Do(func() {
stdDialer = &net.Dialer{
Timeout: 30 * time.Second,
KeepAlive: 30 * time.Second,
// DualStack is set from the result of netutil.TCP6Enabled().
DualStack: netutil.TCP6Enabled(),
}
})
return stdDialer
}
var (
stdDialer *net.Dialer
stdDialerOnce sync.Once
)
var (
dialsTotal = metrics.NewCounter(`vm_promscrape_dials_total`)
dialErrors = metrics.NewCounter(`vm_promscrape_dial_errors_total`)