app/vmselect: add -search.logQueryMemoryUsage command-line flag for logging queries that consume large amounts of memory
Thanks to @michal-kralik for the initial attempts at this feature:
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3651
- https://github.com/VictoriaMetrics/VictoriaMetrics/pull/3715

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3553
This commit is contained in:
parent c080443fef
commit b285207aa7
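In short, the commit estimates the memory a rollup query needs and logs a warning when that estimate crosses the new -search.logQueryMemoryUsage threshold, while -search.maxMemoryPerQuery keeps rejecting queries that exceed the hard limit. A minimal standalone sketch of that interaction, condensed from the diff below (the function and variable names here are hypothetical; only the flag names and the log-then-reject behaviour come from the commit):

package main

import (
	"fmt"
	"log"
)

// checkQueryMemory is a hypothetical condensation of the checks this commit
// touches: the logging threshold only emits a warning, while the hard limit
// rejects the query with an error.
func checkQueryMemory(estimated, logThreshold, hardLimit int64, query string) error {
	if logThreshold > 0 && estimated > logThreshold {
		// Mirrors -search.logQueryMemoryUsage: log the heavy query, but keep executing it.
		log.Printf("query %q requires %d bytes of memory, exceeding -search.logQueryMemoryUsage=%d", query, estimated, logThreshold)
	}
	if hardLimit > 0 && estimated > hardLimit {
		// Mirrors -search.maxMemoryPerQuery: reject the query outright.
		return fmt.Errorf("not enough memory for %q: %d bytes requested, -search.maxMemoryPerQuery=%d", query, estimated, hardLimit)
	}
	return nil
}

func main() {
	// Example: a 200 MiB estimate with a 100 MiB logging threshold and no hard limit -> logged, but executed.
	if err := checkQueryMemory(200<<20, 100<<20, 0, `rate(http_requests_total[5m])`); err != nil {
		log.Fatal(err)
	}
}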
@@ -35,8 +35,9 @@ var (
 	maxQueueDuration = flag.Duration("search.maxQueueDuration", 10*time.Second, "The maximum time the request waits for execution when -search.maxConcurrentRequests "+
 		"limit is reached; see also -search.maxQueryDuration")
 	resetCacheAuthKey = flag.String("search.resetCacheAuthKey", "", "Optional authKey for resetting rollup cache via /internal/resetRollupResultCache call")
-	logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging")
+	logSlowQueryDuration = flag.Duration("search.logSlowQueryDuration", 5*time.Second, "Log queries with execution time exceeding this value. Zero disables slow query logging. "+
+		"See also -search.logQueryMemoryUsage")
 	vmalertProxyURL = flag.String("vmalert.proxyURL", "", "Optional URL for proxying requests to vmalert. For example, if -vmalert.proxyURL=http://vmalert:8880 , then alerting API requests such as /api/v1/rules from Grafana will be proxied to http://vmalert:8880/api/v1/rules")
 )

 var slowQueries = metrics.NewCounter(`vm_slow_queries_total`)
@@ -759,6 +759,9 @@ func QueryHandler(qt *querytracer.Tracer, startTime time.Time, w http.ResponseWr
 		LookbackDelta:       lookbackDelta,
 		RoundDigits:         getRoundDigits(r),
 		EnforcedTagFilterss: etfs,
+		GetRequestURI: func() string {
+			return httpserver.GetRequestURI(r)
+		},
 	}
 	result, err := promql.Exec(qt, &ec, query, true)
 	if err != nil {
@@ -860,6 +863,9 @@ func queryRangeHandler(qt *querytracer.Tracer, startTime time.Time, w http.Respo
 		LookbackDelta:       lookbackDelta,
 		RoundDigits:         getRoundDigits(r),
 		EnforcedTagFilterss: etfs,
+		GetRequestURI: func() string {
+			return httpserver.GetRequestURI(r)
+		},
 	}
 	result, err := promql.Exec(qt, &ec, query, false)
 	if err != nil {
@@ -30,7 +30,11 @@ var (
 		"See https://valyala.medium.com/prometheus-subqueries-in-victoriametrics-9b1492b720b3")
 	maxMemoryPerQuery = flagutil.NewBytes("search.maxMemoryPerQuery", 0, "The maximum amounts of memory a single query may consume. "+
 		"Queries requiring more memory are rejected. The total memory limit for concurrently executed queries can be estimated "+
-		"as -search.maxMemoryPerQuery multiplied by -search.maxConcurrentRequests")
+		"as -search.maxMemoryPerQuery multiplied by -search.maxConcurrentRequests . "+
+		"See also -search.logQueryMemoryUsage")
+	logQueryMemoryUsage = flagutil.NewBytes("search.logQueryMemoryUsage", 0, "Log queries, which require more memory than specified by this flag. "+
+		"This may help detecting and optimizing heavy queries. Query logging is disabled by default. "+
+		"See also -search.logSlowQueryDuration and -search.maxMemoryPerQuery")
 	noStaleMarkers = flag.Bool("search.noStaleMarkers", false, "Set this flag to true if the database doesn't contain Prometheus stale markers, "+
 		"so there is no need in spending additional CPU time on its handling. Staleness markers may exist only in data obtained from Prometheus scrape targets")
 )
@@ -123,6 +127,10 @@ type EvalConfig struct {
 	// EnforcedTagFilterss may contain additional label filters to use in the query.
 	EnforcedTagFilterss [][]storage.TagFilter

+	// The callback, which returns the request URI during logging.
+	// The request URI isn't stored here because its construction may take non-trivial amounts of CPU.
+	GetRequestURI func() string
+
 	timestamps     []int64
 	timestampsOnce sync.Once
 }
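The comment above explains why the request URI is exposed as a callback rather than a precomputed string: it only needs to be built when a query actually gets logged. A tiny illustrative sketch of that pattern (the type and helper here are hypothetical simplifications of EvalConfig):

package main

import "fmt"

// evalConfig is a stripped-down stand-in for EvalConfig, keeping only the callback.
type evalConfig struct {
	GetRequestURI func() string
}

// buildRequestURI stands in for the (potentially expensive) URI construction.
func buildRequestURI() string {
	fmt.Println("constructing request URI...")
	return "/api/v1/query?query=up"
}

func main() {
	ec := evalConfig{GetRequestURI: buildRequestURI}

	needLogging := false // the common case: nothing to log, so the URI is never built
	if needLogging {
		fmt.Println("requestURI:", ec.GetRequestURI())
	}
}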
@@ -140,6 +148,7 @@ func copyEvalConfig(src *EvalConfig) *EvalConfig {
 	ec.LookbackDelta = src.LookbackDelta
 	ec.RoundDigits = src.RoundDigits
 	ec.EnforcedTagFilterss = src.EnforcedTagFilterss
+	ec.GetRequestURI = src.GetRequestURI

 	// do not copy src.timestamps - they must be generated again.
 	return &ec
@@ -1079,26 +1088,33 @@ func evalRollupFuncWithMetricExpr(qt *querytracer.Tracer, ec *EvalConfig, funcNa
 	}
 	rollupPoints := mulNoOverflow(pointsPerTimeseries, int64(timeseriesLen*len(rcs)))
 	rollupMemorySize = sumNoOverflow(mulNoOverflow(int64(rssLen), 1000), mulNoOverflow(rollupPoints, 16))
+	if maxMemory := int64(logQueryMemoryUsage.N); maxMemory > 0 && rollupMemorySize > maxMemory {
+		requestURI := ec.GetRequestURI()
+		logger.Warnf("remoteAddr=%s, requestURI=%s: the %s requires %d bytes of memory for processing; "+
+			"logging this query, since it exceeds the -search.logQueryMemoryUsage=%d; "+
+			"the query selects %d time series and generates %d points across all the time series; try reducing the number of selected time series",
+			ec.QuotedRemoteAddr, requestURI, expr.AppendString(nil), rollupMemorySize, maxMemory, timeseriesLen*len(rcs), rollupPoints)
+	}
 	if maxMemory := int64(maxMemoryPerQuery.N); maxMemory > 0 && rollupMemorySize > maxMemory {
 		rss.Cancel()
 		return nil, &UserReadableError{
-			Err: fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series "+
+			Err: fmt.Errorf("not enough memory for processing %s, which returns %d data points across %d time series with %d points in each time series "+
 				"according to -search.maxMemoryPerQuery=%d; requested memory: %d bytes; "+
 				"possible solutions are: reducing the number of matching time series; increasing `step` query arg (step=%gs); "+
 				"increasing -search.maxMemoryPerQuery",
-				rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, maxMemory, rollupMemorySize, float64(ec.Step)/1e3),
+				expr.AppendString(nil), rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, maxMemory, rollupMemorySize, float64(ec.Step)/1e3),
 		}
 	}
 	rml := getRollupMemoryLimiter()
 	if !rml.Get(uint64(rollupMemorySize)) {
 		rss.Cancel()
 		return nil, &UserReadableError{
-			Err: fmt.Errorf("not enough memory for processing %d data points across %d time series with %d points in each time series; "+
+			Err: fmt.Errorf("not enough memory for processing %s, which returns %d data points across %d time series with %d points in each time series; "+
 				"total available memory for concurrent requests: %d bytes; "+
 				"requested memory: %d bytes; "+
 				"possible solutions are: reducing the number of matching time series; increasing `step` query arg (step=%gs); "+
 				"switching to node with more RAM; increasing -memory.allowedPercent",
-				rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, rml.MaxSize, uint64(rollupMemorySize), float64(ec.Step)/1e3),
+				expr.AppendString(nil), rollupPoints, timeseriesLen*len(rcs), pointsPerTimeseries, rml.MaxSize, uint64(rollupMemorySize), float64(ec.Step)/1e3),
 		}
 	}
 	defer rml.Put(uint64(rollupMemorySize))
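As a worked example of the estimate the checks above rely on: the formula rssLen*1000 + rollupPoints*16 and the helper names come from the diff, while the concrete numbers below are made up and timeseriesLen is assumed equal to rssLen for simplicity.

package main

import "fmt"

// Simplified stand-ins for the overflow-safe helpers used in the diff.
func mulNoOverflow(a, b int64) int64 { return a * b }
func sumNoOverflow(a, b int64) int64 { return a + b }

func main() {
	var (
		rssLen              int64 = 10000 // matching time series (assumed equal to timeseriesLen)
		rcsLen              int64 = 1     // rollup configs for the function
		pointsPerTimeseries int64 = 1000  // points produced per series for the given start/end/step
	)
	rollupPoints := mulNoOverflow(pointsPerTimeseries, rssLen*rcsLen)
	rollupMemorySize := sumNoOverflow(mulNoOverflow(rssLen, 1000), mulNoOverflow(rollupPoints, 16))

	// 10000*1000 + 10000000*16 = 170000000 bytes, i.e. roughly 162 MiB,
	// so -search.logQueryMemoryUsage=100MB would log such a query.
	fmt.Printf("estimated memory: %d bytes (~%.0f MiB)\n", rollupMemorySize, float64(rollupMemorySize)/(1<<20))
}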
@@ -33,6 +33,7 @@ The following tip changes can be tested by building VictoriaMetrics components f
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_outliers(k, q)` function for dropping outliers located farther than `k*range_mad(q)` from the `range_median(q)`. This should help removing outliers during query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759).
 * FEATURE: [MetricsQL](https://docs.victoriametrics.com/MetricsQL.html): add `range_trim_zscore(z, q)` function for dropping outliers located farther than `z*range_stddev(q)` from `range_avg(q)`. This should help removing outliers during query time at [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3759).
 * FEATURE: [vmui](https://docs.victoriametrics.com/#vmui): show `median` instead of `avg` in graph tooltip and line legend, since `median` is more tolerant against spikes. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3706).
+* FEATURE: add `-search.logQueryMemoryUsage` command-line flag for logging queries, which need more memory than specified by this command-line flag. See [this feature request](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3553). Thanks to @michal-kralik for the idea and the initial implementation.
 * FEATURE: allow setting zero value for `-search.latencyOffset` command-line flag. This may be needed in [some cases](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2061#issuecomment-1299109836). Previously the minimum supported value for `-search.latencyOffset` command-line flag was `1s`.

 * BUGFIX: [vmagent](https://docs.victoriametrics.com/vmagent.html): immediately cancel in-flight scrape requests during configuration reload when [stream parsing mode](https://docs.victoriametrics.com/vmagent.html#stream-parsing-mode) is disabled. Previously `vmagent` could wait for long time until all the in-flight requests are completed before reloading the configuration. This could significantly slow down configuration reload. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3747).