app/vmselect: add per-tenant /api/v1/status/top_queries handler

This commit is contained in:
Aliaksandr Valialkin 2020-12-27 12:53:50 +02:00
parent 0e739efc88
commit 5bbf200de2
5 changed files with 84 additions and 48 deletions

View File

@ -198,6 +198,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
and `YYYY-MM-DD` is the date for collecting the stats. By default the stats is collected for the current day.
- `api/v1/status/active_queries` - for currently executed active queries. Note that every `vmselect` maintains an independent list of active queries,
which is returned in the response.
- `api/v1/status/top_queries` - for listing the most frequently executed queries and queries taking the most duration.
* URLs for [Graphite Metrics API](https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api): `http://<vmselect>:8481/select/<accountID>/graphite/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying data namespace for query (aka tenant)
@ -214,7 +215,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
- `tags/autoComplete/values` - returns tag values matching the given `valuePrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support).
- `tags/delSeries` - deletes series matching the given `path`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb).
* URL for query stats across all tenants: `http://<vmselect>:8481/api/v1/status/top_queries`. It returns query lists with the most frequently executed queries and queries taking the most duration.
* URL for query stats across all tenants: `http://<vmselect>:8481/api/v1/status/top_queries`. It lists with the most frequently executed queries and queries taking the most duration.
* URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't

View File

@ -172,10 +172,10 @@ func requestHandler(w http.ResponseWriter, r *http.Request) bool {
return true
}
if path == "/api/v1/status/top_queries" {
topQueriesRequests.Inc()
if err := prometheus.QueryStatsHandler(startTime, w, r); err != nil {
topQueriesErrors.Inc()
sendPrometheusError(w, r, fmt.Errorf("cannot query status endpoint: %w", err))
globalTopQueriesRequests.Inc()
if err := prometheus.QueryStatsHandler(startTime, nil, w, r); err != nil {
globalTopQueriesErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
@ -295,6 +295,14 @@ func selectHandler(startTime time.Time, w http.ResponseWriter, r *http.Request,
statusActiveQueriesRequests.Inc()
promql.WriteActiveQueries(w)
return true
case "prometheus/api/v1/status/top_queries":
topQueriesRequests.Inc()
if err := prometheus.QueryStatsHandler(startTime, at, w, r); err != nil {
topQueriesErrors.Inc()
sendPrometheusError(w, r, err)
return true
}
return true
case "prometheus/api/v1/export":
exportRequests.Inc()
if err := prometheus.ExportHandler(startTime, at, w, r); err != nil {
@ -501,11 +509,14 @@ var (
statusTSDBRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/status/tsdb"}`)
statusTSDBErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/status/tsdb"}`)
topQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/top_queries"}`)
topQueriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/top_queries"}`)
statusActiveQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}prometheus/api/v1/status/active_queries"}`)
topQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/select/{}/prometheus/api/v1/status/top_queries"}`)
topQueriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/select/{}/prometheus/api/v1/status/top_queries"}`)
globalTopQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/top_queries"}`)
globalTopQueriesErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/top_queries"}`)
deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`)
deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/delete/{}/prometheus/api/v1/admin/tsdb/delete_series"}`)

View File

@ -1329,7 +1329,7 @@ func getLatencyOffsetMilliseconds() int64 {
}
// QueryStatsHandler returns query stats at `/api/v1/status/top_queries`
func QueryStatsHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error {
func QueryStatsHandler(startTime time.Time, at *auth.Token, w http.ResponseWriter, r *http.Request) error {
if err := r.ParseForm(); err != nil {
return fmt.Errorf("cannot parse form values: %w", err)
}
@ -1346,10 +1346,15 @@ func QueryStatsHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
if err != nil {
return fmt.Errorf("cannot parse `maxLifetime` arg: %w", err)
}
maxLifetime := time.Duration(maxLifetimeMsecs) * time.Millisecond
w.Header().Set("Content-Type", "application/json; charset=utf-8")
bw := bufferedwriter.Get(w)
defer bufferedwriter.Put(bw)
querystats.WriteJSONQueryStats(bw, topN, time.Duration(maxLifetimeMsecs)*time.Millisecond)
if at == nil {
querystats.WriteJSONQueryStats(bw, topN, maxLifetime)
} else {
querystats.WriteJSONQueryStatsForAccountProject(bw, topN, at.AccountID, at.ProjectID, maxLifetime)
}
if err := bw.Flush(); err != nil {
return err
}

View File

@ -39,7 +39,17 @@ func RegisterQuery(accountID, projectID uint32, query string, timeRangeMsecs int
// WriteJSONQueryStats writes query stats to given writer in json format.
func WriteJSONQueryStats(w io.Writer, topN int, maxLifetime time.Duration) {
initOnce.Do(initQueryStats)
qsTracker.writeJSONQueryStats(w, topN, maxLifetime)
qsTracker.writeJSONQueryStats(w, topN, nil, maxLifetime)
}
// WriteJSONQueryStatsForAccountProject writes query stats for the given (accountID, projectID) to given writer in json format.
func WriteJSONQueryStatsForAccountProject(w io.Writer, topN int, accountID, projectID uint32, maxLifetime time.Duration) {
initOnce.Do(initQueryStats)
apFilter := &accountProjectFilter{
accountID: accountID,
projectID: projectID,
}
qsTracker.writeJSONQueryStats(w, topN, apFilter, maxLifetime)
}
// queryStatsTracker holds statistics for queries
@ -65,6 +75,11 @@ type queryStatKey struct {
timeRangeSecs int64
}
type accountProjectFilter struct {
accountID uint32
projectID uint32
}
func initQueryStats() {
recordsCount := *lastQueriesCount
if recordsCount <= 0 {
@ -78,12 +93,12 @@ func initQueryStats() {
}
}
func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, maxLifetime time.Duration) {
func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) {
fmt.Fprintf(w, `{"topN":"%d","maxLifetime":%q,`, topN, maxLifetime)
fmt.Fprintf(w, `"search.queryStats.lastQueriesCount":%d,`, *lastQueriesCount)
fmt.Fprintf(w, `"search.queryStats.minQueryDuration":%q,`, *minQueryDuration)
fmt.Fprintf(w, `"topByCount":[`)
topByCount := qst.getTopByCount(topN, maxLifetime)
topByCount := qst.getTopByCount(topN, apFilter, maxLifetime)
for i, r := range topByCount {
fmt.Fprintf(w, `{"accountID":%d,"projectID":%d,"query":%q,"timeRangeSeconds":%d,"count":%d}`, r.accountID, r.projectID, r.query, r.timeRangeSecs, r.count)
if i+1 < len(topByCount) {
@ -91,7 +106,7 @@ func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, maxLife
}
}
fmt.Fprintf(w, `],"topByAvgDuration":[`)
topByAvgDuration := qst.getTopByAvgDuration(topN, maxLifetime)
topByAvgDuration := qst.getTopByAvgDuration(topN, apFilter, maxLifetime)
for i, r := range topByAvgDuration {
fmt.Fprintf(w, `{"accountID":%d,"projectID":%d,"query":%q,"timeRangeSeconds":%d,"avgDurationSeconds":%.3f}`,
r.accountID, r.projectID, r.query, r.timeRangeSecs, r.duration.Seconds())
@ -100,7 +115,7 @@ func (qst *queryStatsTracker) writeJSONQueryStats(w io.Writer, topN int, maxLife
}
}
fmt.Fprintf(w, `],"topBySumDuration":[`)
topBySumDuration := qst.getTopBySumDuration(topN, maxLifetime)
topBySumDuration := qst.getTopBySumDuration(topN, apFilter, maxLifetime)
for i, r := range topBySumDuration {
fmt.Fprintf(w, `{"accountID":%d,"projectID":%d,"query":%q,"timeRangeSeconds":%d,"sumDurationSeconds":%.3f}`,
r.accountID, r.projectID, r.query, r.timeRangeSecs, r.duration.Seconds())
@ -136,22 +151,35 @@ func (qst *queryStatsTracker) registerQuery(accountID, projectID uint32, query s
r.duration = duration
}
func (qst *queryStatsTracker) getTopByCount(topN int, maxLifetime time.Duration) []queryStatByCount {
currentTime := time.Now()
qst.mu.Lock()
m := make(map[queryStatKey]int)
for _, r := range qst.a {
func (r *queryStatRecord) matches(apFilter *accountProjectFilter, currentTime time.Time, maxLifetime time.Duration) bool {
if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
continue
return false
}
k := queryStatKey{
if apFilter != nil && (apFilter.accountID != r.accountID || apFilter.projectID != r.projectID) {
return false
}
return true
}
func (r *queryStatRecord) key() queryStatKey {
return queryStatKey{
accountID: r.accountID,
projectID: r.projectID,
query: r.query,
timeRangeSecs: r.timeRangeSecs,
}
}
func (qst *queryStatsTracker) getTopByCount(topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) []queryStatByCount {
currentTime := time.Now()
qst.mu.Lock()
m := make(map[queryStatKey]int)
for _, r := range qst.a {
if r.matches(apFilter, currentTime, maxLifetime) {
k := r.key()
m[k] = m[k] + 1
}
}
qst.mu.Unlock()
var a []queryStatByCount
@ -181,7 +209,7 @@ type queryStatByCount struct {
count int
}
func (qst *queryStatsTracker) getTopByAvgDuration(topN int, maxLifetime time.Duration) []queryStatByDuration {
func (qst *queryStatsTracker) getTopByAvgDuration(topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) []queryStatByDuration {
currentTime := time.Now()
qst.mu.Lock()
type countSum struct {
@ -190,20 +218,14 @@ func (qst *queryStatsTracker) getTopByAvgDuration(topN int, maxLifetime time.Dur
}
m := make(map[queryStatKey]countSum)
for _, r := range qst.a {
if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
continue
}
k := queryStatKey{
accountID: r.accountID,
projectID: r.projectID,
query: r.query,
timeRangeSecs: r.timeRangeSecs,
}
if r.matches(apFilter, currentTime, maxLifetime) {
k := r.key()
ks := m[k]
ks.count++
ks.sum += r.duration
m[k] = ks
}
}
qst.mu.Unlock()
var a []queryStatByDuration
@ -233,22 +255,16 @@ type queryStatByDuration struct {
duration time.Duration
}
func (qst *queryStatsTracker) getTopBySumDuration(topN int, maxLifetime time.Duration) []queryStatByDuration {
func (qst *queryStatsTracker) getTopBySumDuration(topN int, apFilter *accountProjectFilter, maxLifetime time.Duration) []queryStatByDuration {
currentTime := time.Now()
qst.mu.Lock()
m := make(map[queryStatKey]time.Duration)
for _, r := range qst.a {
if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
continue
}
k := queryStatKey{
accountID: r.accountID,
projectID: r.projectID,
query: r.query,
timeRangeSecs: r.timeRangeSecs,
}
if r.matches(apFilter, currentTime, maxLifetime) {
k := r.key()
m[k] = m[k] + r.duration
}
}
qst.mu.Unlock()
var a []queryStatByDuration

View File

@ -198,6 +198,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
and `YYYY-MM-DD` is the date for collecting the stats. By default the stats is collected for the current day.
- `api/v1/status/active_queries` - for currently executed active queries. Note that every `vmselect` maintains an independent list of active queries,
which is returned in the response.
- `api/v1/status/top_queries` - for listing the most frequently executed queries and queries taking the most duration.
* URLs for [Graphite Metrics API](https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api): `http://<vmselect>:8481/select/<accountID>/graphite/<suffix>`, where:
- `<accountID>` is an arbitrary number identifying data namespace for query (aka tenant)
@ -214,6 +215,8 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
- `tags/autoComplete/values` - returns tag values matching the given `valuePrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support).
- `tags/delSeries` - deletes series matching the given `path`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb).
* URL for query stats across all tenants: `http://<vmselect>:8481/api/v1/status/top_queries`. It lists with the most frequently executed queries and queries taking the most duration.
* URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't
be used on a regular basis, since it carries non-zero overhead.