diff --git a/README.md b/README.md index 7281b22c14..da8df3bf35 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr and `YYYY-MM-DD` is the date for collecting the stats. By default the stats is collected for the current day. - `api/v1/status/active_queries` - for currently executed active queries. Note that every `vmselect` maintains an independent list of active queries, which is returned in the response. + - `api/v1/status/top_queries` - for listing the most frequently executed queries and queries taking the most duration. * URLs for [Graphite Metrics API](https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api): `http://:8481/select//graphite/`, where: - `` is an arbitrary number identifying data namespace for query (aka tenant) @@ -214,7 +215,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr - `tags/autoComplete/values` - returns tag values matching the given `valuePrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support). - `tags/delSeries` - deletes series matching the given `path`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb). -* URL for query stats across all tenants: `http://:8481/api/v1/status/top_queries`. It returns query lists with the most frequently executed queries and queries taking the most duration. +* URL for query stats across all tenants: `http://:8481/api/v1/status/top_queries`. It lists the most frequently executed queries and queries taking the most duration. * URL for time series deletion: `http://:8481/delete//prometheus/api/v1/admin/tsdb/delete_series?match[]=`. Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. 
It shouldn't be used on a regular basis, since it carries non-zero overhead. diff --git a/app/vmselect/main.go b/app/vmselect/main.go index af8e78b85a..23e58659ba 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -172,10 +172,10 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool { return true } if path == "/api/v1/status/top_queries" { - topQueriesRequests.Inc() - if err := prometheus.QueryStatsHandler(startTime, w, r); err != nil { - topQueriesErrors.Inc() - sendPrometheusError(w, r, fmt.Errorf("cannot query status endpoint: %w", err)) + globalTopQueriesRequests.Inc() + if err := prometheus.QueryStatsHandler(startTime, nil, w, r); err != nil { + globalTopQueriesErrors.Inc() + sendPrometheusError(w, r, err) return true } return true @@ -295,6 +295,14 @@ func selectHandler(startTime time.Time, w http.ResponseWriter, r *http.Request, statusActiveQueriesRequests.Inc() promql.WriteActiveQueries(w) return true + case "prometheus/api/v1/status/top_queries": + topQueriesRequests.Inc() + if err := prometheus.QueryStatsHandler(startTime, at, w, r); err != nil { + topQueriesErrors.Inc() + sendPrometheusError(w, r, err) + return true + } + return true case "prometheus/api/v1/export": exportRequests.Inc() if err := prometheus.ExportHandler(startTime, at, w, r); err != nil { @@ -501,11 +509,14 @@ var ( statusTSDBRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/status/tsdb"}`) statusTSDBErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/status/tsdb"}`) - topQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/top_queries"}`) - topQueriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/top_queries"}`) - statusActiveQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}prometheus/api/v1/status/active_queries"}`) + topQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/status/top_queries"}`) + topQueriesErrors 
= metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/status/top_queries"}`) + + globalTopQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/top_queries"}`) + globalTopQueriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/top_queries"}`) + deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`) deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`) diff --git a/app/vmselect/prometheus/prometheus.go b/app/vmselect/prometheus/prometheus.go index 879ac2e69c..2fb28d29f1 100644 --- a/app/vmselect/prometheus/prometheus.go +++ b/app/vmselect/prometheus/prometheus.go @@ -1329,7 +1329,7 @@ func getLatencyOffsetMilliseconds() int64 { } // QueryStatsHandler returns query stats at `/api/v1/status/top_queries` -func QueryStatsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error { +func QueryStatsHandler(startTime time.Time, at *auth.Token, w http.ResponseWriter, r *http.Request) error { if err := r.ParseForm(); err != nil { return fmt.Errorf("cannot parse form values: %w", err) } @@ -1346,10 +1346,15 @@ func QueryStatsHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque if err != nil { return fmt.Errorf("cannot parse `maxLifetime` arg: %w", err) } + maxLifetime := time.Duration(maxLifetimeMsecs) * time.Millisecond w.Header().Set("Content-Type", "application/json; charset=utf-8") bw := bufferedwriter.Get(w) defer bufferedwriter.Put(bw) - querystats.WriteJSONQueryStats(bw, topN, time.Duration(maxLifetimeMsecs)*time.Millisecond) + if at == nil { + querystats.WriteJSONQueryStats(bw, topN, maxLifetime) + } else { + querystats.WriteJSONQueryStatsForAccountProject(bw, topN, at.AccountID, at.ProjectID, maxLifetime) + } if err := bw.Flush(); err != nil { return err } diff --git a/app/vmselect/querystats/querystats.go 
b/app/vmselect/querystats/querystats.go index 6772e28635..acc0509748 100644 --- a/app/vmselect/querystats/querystats.go +++ b/app/vmselect/querystats/querystats.go @@ -39,7 +39,17 @@ func RegisterQuery(accountID, projectID uint32, query string, timeRangeMsecs int // WriteJSONQueryStats writes query stats to given writer in json format. func WriteJSONQueryStats(w io.Writer, topN int, maxLifetime time.Duration) { initOnce.Do(initQueryStats) - qsTracker.writeJSONQueryStats(w, topN, maxLifetime) + qsTracker.writeJSONQueryStats(w, topN, nil, maxLifetime) +} + +// WriteJSONQueryStatsForAccountProject writes query stats for the given (accountID, projectID) to given writer in json format. +func WriteJSONQueryStatsForAccountProject(w io.Writer, topN int, accountID, projectID uint32, maxLifetime time.Duration) { + initOnce.Do(initQueryStats) + apFilter := &accountProjectFilter{ + accountID: accountID, + projectID: projectID, + } + qsTracker.writeJSONQueryStats(w, topN, apFilter, maxLifetime) } // queryStatsTracker holds statistics for queries @@ -65,6 +75,11 @@ type queryStatKey struct { timeRangeSecs int64 } +type accountProjectFilter struct { + accountID uint32 + projectID uint32 +} + func initQueryStats() { recordsCount := *lastQueriesCount if recordsCount <= 0 { @@ -78,12 +93,12 @@ func initQueryStats() { } } -func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, maxLifetime time.Duration) { +func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) { fmt.Fprintf(w, `{"topN":"%d","maxLifetime":%q,`, topN, maxLifetime) fmt.Fprintf(w, `"search.queryStats.lastQueriesCount":%d,`, *lastQueriesCount) fmt.Fprintf(w, `"search.queryStats.minQueryDuration":%q,`, *minQueryDuration) fmt.Fprintf(w, `"topByCount":[`) - topByCount := qst.getTopByCount(topN, maxLifetime) + topByCount := qst.getTopByCount(topN, apFilter, maxLifetime) for i, r := range topByCount { fmt.Fprintf(w, 
`{"accountID":%d,"projectID":%d,"query":%q,"timeRangeSeconds":%d,"count":%d}`, r.accountID, r.projectID, r.query, r.timeRangeSecs, r.count) if i+1 < len(topByCount) { @@ -91,7 +106,7 @@ func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, maxLife } } fmt.Fprintf(w, `],"topByAvgDuration":[`) - topByAvgDuration := qst.getTopByAvgDuration(topN, maxLifetime) + topByAvgDuration := qst.getTopByAvgDuration(topN, apFilter, maxLifetime) for i, r := range topByAvgDuration { fmt.Fprintf(w, `{"accountID":%d,"projectID":%d,"query":%q,"timeRangeSeconds":%d,"avgDurationSeconds":%.3f}`, r.accountID, r.projectID, r.query, r.timeRangeSecs, r.duration.Seconds()) @@ -100,7 +115,7 @@ func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, maxLife } } fmt.Fprintf(w, `],"topBySumDuration":[`) - topBySumDuration := qst.getTopBySumDuration(topN, maxLifetime) + topBySumDuration := qst.getTopBySumDuration(topN, apFilter, maxLifetime) for i, r := range topBySumDuration { fmt.Fprintf(w, `{"accountID":%d,"projectID":%d,"query":%q,"timeRangeSeconds":%d,"sumDurationSeconds":%.3f}`, r.accountID, r.projectID, r.query, r.timeRangeSecs, r.duration.Seconds()) @@ -136,21 +151,34 @@ func (qst *queryStatsTracker) registerQuery(accountID, projectID uint32, query s r.duration = duration } -func (qst *queryStatsTracker) getTopByCount(topN int, maxLifetime time.Duration) []queryStatByCount { +func (r *queryStatRecord) matches(apFilter *accountProjectFilter, currentTime time.Time, maxLifetime time.Duration) bool { + if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime { + return false + } + if apFilter != nil && (apFilter.accountID != r.accountID || apFilter.projectID != r.projectID) { + return false + } + return true +} + +func (r *queryStatRecord) key() queryStatKey { + return queryStatKey{ + accountID: r.accountID, + projectID: r.projectID, + query: r.query, + timeRangeSecs: r.timeRangeSecs, + } +} + +func (qst *queryStatsTracker) getTopByCount(topN int, 
apFilter *accountProjectFilter, maxLifetime time.Duration) []queryStatByCount { currentTime := time.Now() qst.mu.Lock() m := make(map[queryStatKey]int) for _, r := range qst.a { - if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime { - continue + if r.matches(apFilter, currentTime, maxLifetime) { + k := r.key() + m[k] = m[k] + 1 } - k := queryStatKey{ - accountID: r.accountID, - projectID: r.projectID, - query: r.query, - timeRangeSecs: r.timeRangeSecs, - } - m[k] = m[k] + 1 } qst.mu.Unlock() @@ -181,7 +209,7 @@ type queryStatByCount struct { count int } -func (qst *queryStatsTracker) getTopByAvgDuration(topN int, maxLifetime time.Duration) []queryStatByDuration { +func (qst *queryStatsTracker) getTopByAvgDuration(topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) []queryStatByDuration { currentTime := time.Now() qst.mu.Lock() type countSum struct { @@ -190,19 +218,13 @@ func (qst *queryStatsTracker) getTopByAvgDuration(topN int, maxLifetime time.Dur } m := make(map[queryStatKey]countSum) for _, r := range qst.a { - if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime { - continue + if r.matches(apFilter, currentTime, maxLifetime) { + k := r.key() + ks := m[k] + ks.count++ + ks.sum += r.duration + m[k] = ks } - k := queryStatKey{ - accountID: r.accountID, - projectID: r.projectID, - query: r.query, - timeRangeSecs: r.timeRangeSecs, - } - ks := m[k] - ks.count++ - ks.sum += r.duration - m[k] = ks } qst.mu.Unlock() @@ -233,21 +255,15 @@ type queryStatByDuration struct { duration time.Duration } -func (qst *queryStatsTracker) getTopBySumDuration(topN int, maxLifetime time.Duration) []queryStatByDuration { +func (qst *queryStatsTracker) getTopBySumDuration(topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) []queryStatByDuration { currentTime := time.Now() qst.mu.Lock() m := make(map[queryStatKey]time.Duration) for _, r := range qst.a { - if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime { - 
 continue + if r.matches(apFilter, currentTime, maxLifetime) { + k := r.key() + m[k] = m[k] + r.duration } - k := queryStatKey{ - accountID: r.accountID, - projectID: r.projectID, - query: r.query, - timeRangeSecs: r.timeRangeSecs, - } - m[k] = m[k] + r.duration } qst.mu.Unlock() diff --git a/docs/Cluster-VictoriaMetrics.md b/docs/Cluster-VictoriaMetrics.md index 6d798c3175..da8df3bf35 100644 --- a/docs/Cluster-VictoriaMetrics.md +++ b/docs/Cluster-VictoriaMetrics.md @@ -198,6 +198,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr and `YYYY-MM-DD` is the date for collecting the stats. By default the stats is collected for the current day. - `api/v1/status/active_queries` - for currently executed active queries. Note that every `vmselect` maintains an independent list of active queries, which is returned in the response. + - `api/v1/status/top_queries` - for listing the most frequently executed queries and queries taking the most duration. * URLs for [Graphite Metrics API](https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api): `http://:8481/select//graphite/`, where: - `` is an arbitrary number identifying data namespace for query (aka tenant) @@ -214,6 +215,8 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr - `tags/autoComplete/values` - returns tag values matching the given `valuePrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support). - `tags/delSeries` - deletes series matching the given `path`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb). +* URL for query stats across all tenants: `http://:8481/api/v1/status/top_queries`. It lists the most frequently executed queries and queries taking the most duration. + * URL for time series deletion: `http://:8481/delete//prometheus/api/v1/admin/tsdb/delete_series?match[]=`. 
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't be used on a regular basis, since it carries non-zero overhead.