app/vmselect: sync query stats handling with cluster version

2025-01-20 07:19:17 +01:00 · 2020-12-27 12:53:50 +02:00 · 2020-12-27 12:53:50 +02:00 · 4b7105a65b
commit 4b7105a65b
parent df0309eae0
4 changed files with 42 additions and 36 deletions
--- a/app/vmselect/main.go
+++ b/app/vmselect/main.go
@ -198,14 +198,6 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
 			return true
 		}
 		return true
-	case "/api/v1/status/top_queries":
-		topQueriesRequests.Inc()
-		if err := prometheus.QueryStatsHandler(startTime, w, r); err != nil {
-			topQueriesErrors.Inc()
-			sendPrometheusError(w, r, fmt.Errorf("cannot query status endpoint: %w", err))
-			return true
-		}
-		return true
 	case "/api/v1/status/tsdb":
 		statusTSDBRequests.Inc()
 		if err := prometheus.TSDBStatusHandler(startTime, w, r); err != nil {
@ -218,6 +210,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool {
 		statusActiveQueriesRequests.Inc()
 		promql.WriteActiveQueries(w)
 		return true
+	case "/api/v1/status/top_queries":
+		topQueriesRequests.Inc()
+		if err := prometheus.QueryStatsHandler(startTime, w, r); err != nil {
+			topQueriesErrors.Inc()
+			sendPrometheusError(w, r, fmt.Errorf("cannot query status endpoint: %w", err))
+			return true
+		}
+		return true
 	case "/api/v1/export":
 		exportRequests.Inc()
 		if err := prometheus.ExportHandler(startTime, w, r); err != nil {
@ -424,14 +424,14 @@ var (
 	labelsCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/labels/count"}`)
 	labelsCountErrors   = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/labels/count"}`)

-	topQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/top_queries"}`)
-	topQueriesErrors   = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/top_queries"}`)
-
 	statusTSDBRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/tsdb"}`)
 	statusTSDBErrors   = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/tsdb"}`)

 	statusActiveQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/active_queries"}`)

+	topQueriesRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/top_queries"}`)
+	topQueriesErrors   = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/top_queries"}`)
+
 	deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/admin/tsdb/delete_series"}`)
 	deleteErrors   = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/admin/tsdb/delete_series"}`)

--- a/app/vmselect/prometheus/prometheus.go
+++ b/app/vmselect/prometheus/prometheus.go
@ -1270,10 +1270,11 @@ func QueryStatsHandler(startTime time.Time, w http.ResponseWriter, r *http.Reque
 	if err != nil {
 		return fmt.Errorf("cannot parse `maxLifetime` arg: %w", err)
 	}
+	maxLifetime := time.Duration(maxLifetimeMsecs) * time.Millisecond
 	w.Header().Set("Content-Type", "application/json; charset=utf-8")
 	bw := bufferedwriter.Get(w)
 	defer bufferedwriter.Put(bw)
-	querystats.WriteJSONQueryStats(bw, topN, time.Duration(maxLifetimeMsecs)*time.Millisecond)
+	querystats.WriteJSONQueryStats(bw, topN, maxLifetime)
 	if err := bw.Flush(); err != nil {
 		return err
 	}
--- a/app/vmselect/querystats/querystats.go
+++ b/app/vmselect/querystats/querystats.go
@ -128,19 +128,29 @@ func (qst *queryStatsTracker) registerQuery(query string, timeRangeMsecs int64,
 	r.duration = duration
 }

+func (r *queryStatRecord) matches(currentTime time.Time, maxLifetime time.Duration) bool {
+	if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
+		return false
+	}
+	return true
+}
+
+func (r *queryStatRecord) key() queryStatKey {
+	return queryStatKey{
+		query:         r.query,
+		timeRangeSecs: r.timeRangeSecs,
+	}
+}
+
 func (qst *queryStatsTracker) getTopByCount(topN int, maxLifetime time.Duration) []queryStatByCount {
 	currentTime := time.Now()
 	qst.mu.Lock()
 	m := make(map[queryStatKey]int)
 	for _, r := range qst.a {
-		if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
-			continue
+		if r.matches(currentTime, maxLifetime) {
+			k := r.key()
+			m[k] = m[k] + 1
 		}
-		k := queryStatKey{
-			query:         r.query,
-			timeRangeSecs: r.timeRangeSecs,
-		}
-		m[k] = m[k] + 1
 	}
 	qst.mu.Unlock()

@ -176,17 +186,13 @@ func (qst *queryStatsTracker) getTopByAvgDuration(topN int, maxLifetime time.Dur
 	}
 	m := make(map[queryStatKey]countSum)
 	for _, r := range qst.a {
-		if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
-			continue
+		if r.matches(currentTime, maxLifetime) {
+			k := r.key()
+			ks := m[k]
+			ks.count++
+			ks.sum += r.duration
+			m[k] = ks
 		}
-		k := queryStatKey{
-			query:         r.query,
-			timeRangeSecs: r.timeRangeSecs,
-		}
-		ks := m[k]
-		ks.count++
-		ks.sum += r.duration
-		m[k] = ks
 	}
 	qst.mu.Unlock()

@ -218,14 +224,10 @@ func (qst *queryStatsTracker) getTopBySumDuration(topN int, maxLifetime time.Dur
 	qst.mu.Lock()
 	m := make(map[queryStatKey]time.Duration)
 	for _, r := range qst.a {
-		if r.query == "" || currentTime.Sub(r.registerTime) > maxLifetime {
-			continue
+		if r.matches(currentTime, maxLifetime) {
+			k := r.key()
+			m[k] = m[k] + r.duration
 		}
-		k := queryStatKey{
-			query:         r.query,
-			timeRangeSecs: r.timeRangeSecs,
-		}
-		m[k] = m[k] + r.duration
 	}
 	qst.mu.Unlock()

--- a/docs/Cluster-VictoriaMetrics.md
+++ b/docs/Cluster-VictoriaMetrics.md
@ -198,6 +198,7 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
      and `YYYY-MM-DD` is the date for collecting the stats. By default the stats is collected for the current day.
    - `api/v1/status/active_queries` - for currently executed active queries. Note that every `vmselect` maintains an independent list of active queries,
      which is returned in the response.
+    - `api/v1/status/top_queries` - for listing the most frequently executed queries and the queries that take the most time to execute.

 * URLs for [Graphite Metrics API](https://graphite-api.readthedocs.io/en/latest/api.html#the-metrics-api): `http://<vmselect>:8481/select/<accountID>/graphite/<suffix>`, where:
    - `<accountID>` is an arbitrary number identifying data namespace for query (aka tenant)
@ -214,6 +215,8 @@ or [an alternative dashboard for VictoriaMetrics cluster](https://grafana.com/gr
      - `tags/autoComplete/values` - returns tag values matching the given `valuePrefix` and/or `expr`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#auto-complete-support).
      - `tags/delSeries` - deletes series matching the given `path`. See [these docs](https://graphite.readthedocs.io/en/stable/tags.html#removing-series-from-the-tagdb).

+* URL for query stats across all tenants: `http://<vmselect>:8481/api/v1/status/top_queries`. It lists the most frequently executed queries and the queries that take the most time to execute.
+
 * URL for time series deletion: `http://<vmselect>:8481/delete/<accountID>/prometheus/api/v1/admin/tsdb/delete_series?match[]=<timeseries_selector_for_delete>`.
  Note that the `delete_series` handler should be used only in exceptional cases such as deletion of accidentally ingested incorrect time series. It shouldn't
  be used on a regular basis, since it carries non-zero overhead.