From 364db13c9c32999b2d0e9f0531773672408a0176 Mon Sep 17 00:00:00 2001
From: Aliaksandr Valialkin
Date: Wed, 22 Apr 2020 19:57:36 +0300
Subject: [PATCH] app/vmselect: add `/api/v1/status/tsdb` page with useful
 stats for locating the root cause of high cardinality issues

See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/425
Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/268
---
 README.md                                     |   7 +
 app/vmselect/main.go                          |  13 +-
 app/vmselect/netstorage/netstorage.go         |   9 +
 app/vmselect/prometheus/prometheus.go         |  47 +++-
 .../prometheus/tsdb_status_response.qtpl      |  28 ++
 .../prometheus/tsdb_status_response.qtpl.go   | 123 +++++++++
 app/vmstorage/main.go                         |   8 +
 docs/Single-server-VictoriaMetrics.md         |   7 +
 lib/storage/index_db.go                       | 258 +++++++++++++++---
 lib/storage/index_db_test.go                  |  64 +++++
 lib/storage/storage.go                        |   7 +
 11 files changed, 534 insertions(+), 37 deletions(-)
 create mode 100644 app/vmselect/prometheus/tsdb_status_response.qtpl
 create mode 100644 app/vmselect/prometheus/tsdb_status_response.qtpl.go

diff --git a/README.md b/README.md
index 025aae1c8..ef5ebe021 100644
--- a/README.md
+++ b/README.md
@@ -497,6 +497,7 @@ VictoriaMetrics supports the following handlers from [Prometheus querying API](h
 * [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
 * [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
 * [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
+* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats)
 
 These handlers can be queried from Prometheus-compatible clients such as Grafana or curl.
 
@@ -926,6 +927,12 @@ The most interesting metrics are:
   If the gaps are related to irregular intervals between samples, then try adjusting `-search.minStalenessInterval` command-line flag
   to value close to the maximum interval between samples.
 
+* Metrics and labels leading to high cardinality or high churn rate can be determined on the `/api/v1/status/tsdb` page.
+  See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
+  VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` query args on this page. By default `date` equals the current date,
+  while `topN` equals 10.
+
+
 ### Backfilling
 
 VictoriaMetrics accepts historical data in arbitrary order of time via [any supported ingestion method](#how-to-import-time-series-data).
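As a usage sketch before the code changes: the endpoint added by this patch can be queried like any other Prometheus-compatible handler. The snippet below is illustrative and not part of the patch; it assumes a single-node VictoriaMetrics on its default localhost:8428 listen address and decodes the JSON layout produced by the tsdb_status_response.qtpl template added later in this patch.

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"net/http"
)

// tsdbStatusEntry mirrors the {"name": ..., "value": ...} objects emitted
// by tsdb_status_response.qtpl.
type tsdbStatusEntry struct {
	Name  string `json:"name"`
	Value uint64 `json:"value"`
}

// tsdbStatusResponse mirrors the top-level JSON layout of /api/v1/status/tsdb.
type tsdbStatusResponse struct {
	Status string `json:"status"`
	Data   struct {
		SeriesCountByMetricName     []tsdbStatusEntry `json:"seriesCountByMetricName"`
		LabelValueCountByLabelName  []tsdbStatusEntry `json:"labelValueCountByLabelName"`
		SeriesCountByLabelValuePair []tsdbStatusEntry `json:"seriesCountByLabelValuePair"`
	} `json:"data"`
}

func main() {
	// The host:port and the date are illustrative assumptions; when `date`
	// and `topN` are omitted, the handler below defaults them to the
	// current date and 10 respectively.
	resp, err := http.Get("http://localhost:8428/api/v1/status/tsdb?date=2020-04-22&topN=5")
	if err != nil {
		log.Fatalf("cannot query /api/v1/status/tsdb: %s", err)
	}
	defer resp.Body.Close()
	var status tsdbStatusResponse
	if err := json.NewDecoder(resp.Body).Decode(&status); err != nil {
		log.Fatalf("cannot decode response: %s", err)
	}
	for _, e := range status.Data.SeriesCountByMetricName {
		fmt.Printf("metric %q -> %d series\n", e.Name, e.Value)
	}
}

Metric names with the highest series counts are usually the first place to look when hunting high cardinality.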
diff --git a/app/vmselect/main.go b/app/vmselect/main.go index aed69d190..1f00c6255 100644 --- a/app/vmselect/main.go +++ b/app/vmselect/main.go @@ -179,6 +179,14 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { return true } return true + case "/api/v1/status/tsdb": + tsdbStatusRequests.Inc() + if err := prometheus.TSDBStatusHandler(startTime, w, r); err != nil { + tsdbStatusErrors.Inc() + sendPrometheusError(w, r, err) + return true + } + return true case "/api/v1/export": exportRequests.Inc() if err := prometheus.ExportHandler(startTime, w, r); err != nil { @@ -191,7 +199,7 @@ func RequestHandler(w http.ResponseWriter, r *http.Request) bool { federateRequests.Inc() if err := prometheus.FederateHandler(startTime, w, r); err != nil { federateErrors.Inc() - httpserver.Errorf(w, "error int %q: %s", r.URL.Path, err) + httpserver.Errorf(w, "error in %q: %s", r.URL.Path, err) return true } return true @@ -266,6 +274,9 @@ var ( labelsCountRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/labels/count"}`) labelsCountErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/labels/count"}`) + tsdbStatusRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/status/tsdb"}`) + tsdbStatusErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/status/tsdb"}`) + deleteRequests = metrics.NewCounter(`vm_http_requests_total{path="/api/v1/admin/tsdb/delete_series"}`) deleteErrors = metrics.NewCounter(`vm_http_request_errors_total{path="/api/v1/admin/tsdb/delete_series"}`) diff --git a/app/vmselect/netstorage/netstorage.go b/app/vmselect/netstorage/netstorage.go index 40488de35..d32767381 100644 --- a/app/vmselect/netstorage/netstorage.go +++ b/app/vmselect/netstorage/netstorage.go @@ -449,6 +449,15 @@ func GetLabelEntries(deadline Deadline) ([]storage.TagEntry, error) { return labelEntries, nil } +// GetTSDBStatusForDate returns tsdb status according to https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats +func GetTSDBStatusForDate(deadline Deadline, date uint64, topN int) (*storage.TSDBStatus, error) { + status, err := vmstorage.GetTSDBStatusForDate(date, topN) + if err != nil { + return nil, fmt.Errorf("error during tsdb status request: %s", err) + } + return status, nil +} + // GetSeriesCount returns the number of unique series. func GetSeriesCount(deadline Deadline) (uint64, error) { n, err := vmstorage.GetSeriesCount() diff --git a/app/vmselect/prometheus/prometheus.go b/app/vmselect/prometheus/prometheus.go index c88a1b8bb..9cf755f45 100644 --- a/app/vmselect/prometheus/prometheus.go +++ b/app/vmselect/prometheus/prometheus.go @@ -377,7 +377,6 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ if err != nil { return fmt.Errorf(`cannot obtain label entries: %s`, err) } - w.Header().Set("Content-Type", "application/json") WriteLabelsCountResponse(w, labelEntries) labelsCountDuration.UpdateDuration(startTime) @@ -386,6 +385,52 @@ func LabelsCountHandler(startTime time.Time, w http.ResponseWriter, r *http.Requ var labelsCountDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/labels/count"}`) +const secsPerDay = 3600 * 24 + +// TSDBStatusHandler processes /api/v1/status/tsdb request. 
+// +// See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats +func TSDBStatusHandler(startTime time.Time, w http.ResponseWriter, r *http.Request) error { + deadline := getDeadlineForQuery(r) + if err := r.ParseForm(); err != nil { + return fmt.Errorf("cannot parse form values: %s", err) + } + date := time.Now().Unix() / secsPerDay + dateStr := r.FormValue("date") + if len(dateStr) > 0 { + t, err := time.Parse("2006-01-02", dateStr) + if err != nil { + return fmt.Errorf("cannot parse `date` arg %q: %s", dateStr, err) + } + date = t.Unix() / secsPerDay + } + topN := 10 + topNStr := r.FormValue("topN") + if len(topNStr) > 0 { + n, err := strconv.Atoi(topNStr) + if err != nil { + return fmt.Errorf("cannot parse `topN` arg %q: %s", topNStr, err) + } + if n <= 0 { + n = 1 + } + if n > 1000 { + n = 1000 + } + topN = n + } + status, err := netstorage.GetTSDBStatusForDate(deadline, uint64(date), topN) + if err != nil { + return fmt.Errorf(`cannot obtain tsdb status for date=%d, topN=%d: %s`, date, topN, err) + } + w.Header().Set("Content-Type", "application/json") + WriteTSDBStatusResponse(w, status) + tsdbStatusDuration.UpdateDuration(startTime) + return nil +} + +var tsdbStatusDuration = metrics.NewSummary(`vm_request_duration_seconds{path="/api/v1/status/tsdb"}`) + // LabelsHandler processes /api/v1/labels request. // // See https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names diff --git a/app/vmselect/prometheus/tsdb_status_response.qtpl b/app/vmselect/prometheus/tsdb_status_response.qtpl new file mode 100644 index 000000000..3f018eecf --- /dev/null +++ b/app/vmselect/prometheus/tsdb_status_response.qtpl @@ -0,0 +1,28 @@ +{% import "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" %} + +{% stripspace %} +TSDBStatusResponse generates response for /api/v1/status/tsdb . +{% func TSDBStatusResponse(status *storage.TSDBStatus) %} +{ + "status":"success", + "data":{ + "seriesCountByMetricName":{%= tsdbStatusEntries(status.SeriesCountByMetricName) %}, + "labelValueCountByLabelName":{%= tsdbStatusEntries(status.LabelValueCountByLabelName) %}, + "seriesCountByLabelValuePair":{%= tsdbStatusEntries(status.SeriesCountByLabelValuePair) %} + } +} +{% endfunc %} + +{% func tsdbStatusEntries(a []storage.TopHeapEntry) %} +[ + {% for i, e := range a %} + { + "name":{%q= e.Name %}, + "value":{%d= int(e.Count) %} + } + {% if i+1 < len(a) %},{% endif %} + {% endfor %} +] +{% endfunc %} + +{% endstripspace %} diff --git a/app/vmselect/prometheus/tsdb_status_response.qtpl.go b/app/vmselect/prometheus/tsdb_status_response.qtpl.go new file mode 100644 index 000000000..984bdef30 --- /dev/null +++ b/app/vmselect/prometheus/tsdb_status_response.qtpl.go @@ -0,0 +1,123 @@ +// Code generated by qtc from "tsdb_status_response.qtpl". DO NOT EDIT. +// See https://github.com/valyala/quicktemplate for details. + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:1 +package prometheus + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:1 +import "github.com/VictoriaMetrics/VictoriaMetrics/lib/storage" + +// TSDBStatusResponse generates response for /api/v1/status/tsdb . 
+ +//line app/vmselect/prometheus/tsdb_status_response.qtpl:5 +import ( + qtio422016 "io" + + qt422016 "github.com/valyala/quicktemplate" +) + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:5 +var ( + _ = qtio422016.Copy + _ = qt422016.AcquireByteBuffer +) + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:5 +func StreamTSDBStatusResponse(qw422016 *qt422016.Writer, status *storage.TSDBStatus) { +//line app/vmselect/prometheus/tsdb_status_response.qtpl:5 + qw422016.N().S(`{"status":"success","data":{"seriesCountByMetricName":`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:9 + streamtsdbStatusEntries(qw422016, status.SeriesCountByMetricName) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:9 + qw422016.N().S(`,"labelValueCountByLabelName":`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:10 + streamtsdbStatusEntries(qw422016, status.LabelValueCountByLabelName) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:10 + qw422016.N().S(`,"seriesCountByLabelValuePair":`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:11 + streamtsdbStatusEntries(qw422016, status.SeriesCountByLabelValuePair) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:11 + qw422016.N().S(`}}`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 +} + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 +func WriteTSDBStatusResponse(qq422016 qtio422016.Writer, status *storage.TSDBStatus) { +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + qw422016 := qt422016.AcquireWriter(qq422016) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + StreamTSDBStatusResponse(qw422016, status) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + qt422016.ReleaseWriter(qw422016) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 +} + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 +func TSDBStatusResponse(status *storage.TSDBStatus) string { +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + qb422016 := qt422016.AcquireByteBuffer() +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + WriteTSDBStatusResponse(qb422016, status) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + qs422016 := string(qb422016.B) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + qt422016.ReleaseByteBuffer(qb422016) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 + return qs422016 +//line app/vmselect/prometheus/tsdb_status_response.qtpl:14 +} + +//line app/vmselect/prometheus/tsdb_status_response.qtpl:16 +func streamtsdbStatusEntries(qw422016 *qt422016.Writer, a []storage.TopHeapEntry) { +//line app/vmselect/prometheus/tsdb_status_response.qtpl:16 + qw422016.N().S(`[`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:18 + for i, e := range a { +//line app/vmselect/prometheus/tsdb_status_response.qtpl:18 + qw422016.N().S(`{"name":`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:20 + qw422016.N().Q(e.Name) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:20 + qw422016.N().S(`,"value":`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:21 + qw422016.N().D(int(e.Count)) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:21 + qw422016.N().S(`}`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:23 + if i+1 < len(a) { +//line app/vmselect/prometheus/tsdb_status_response.qtpl:23 + qw422016.N().S(`,`) +//line app/vmselect/prometheus/tsdb_status_response.qtpl:23 + } +//line 
app/vmselect/prometheus/tsdb_status_response.qtpl:24
+	}
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:24
+	qw422016.N().S(`]`)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+}
+
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+func writetsdbStatusEntries(qq422016 qtio422016.Writer, a []storage.TopHeapEntry) {
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	qw422016 := qt422016.AcquireWriter(qq422016)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	streamtsdbStatusEntries(qw422016, a)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	qt422016.ReleaseWriter(qw422016)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+}
+
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+func tsdbStatusEntries(a []storage.TopHeapEntry) string {
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	qb422016 := qt422016.AcquireByteBuffer()
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	writetsdbStatusEntries(qb422016, a)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	qs422016 := string(qb422016.B)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	qt422016.ReleaseByteBuffer(qb422016)
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+	return qs422016
+//line app/vmselect/prometheus/tsdb_status_response.qtpl:26
+}
diff --git a/app/vmstorage/main.go b/app/vmstorage/main.go
index d01612a63..2a37731bd 100644
--- a/app/vmstorage/main.go
+++ b/app/vmstorage/main.go
@@ -120,6 +120,14 @@ func SearchTagEntries(maxTagKeys, maxTagValues int) ([]storage.TagEntry, error)
 	return tagEntries, err
 }
 
+// GetTSDBStatusForDate returns TSDB status for the given date.
+func GetTSDBStatusForDate(date uint64, topN int) (*storage.TSDBStatus, error) {
+	WG.Add(1)
+	status, err := Storage.GetTSDBStatusForDate(date, topN)
+	WG.Done()
+	return status, err
+}
+
 // GetSeriesCount returns the number of time series in the storage.
 func GetSeriesCount() (uint64, error) {
 	WG.Add(1)
diff --git a/docs/Single-server-VictoriaMetrics.md b/docs/Single-server-VictoriaMetrics.md
index 025aae1c8..ef5ebe021 100644
--- a/docs/Single-server-VictoriaMetrics.md
+++ b/docs/Single-server-VictoriaMetrics.md
@@ -497,6 +497,7 @@ VictoriaMetrics supports the following handlers from [Prometheus querying API](h
 * [/api/v1/series](https://prometheus.io/docs/prometheus/latest/querying/api/#finding-series-by-label-matchers)
 * [/api/v1/labels](https://prometheus.io/docs/prometheus/latest/querying/api/#getting-label-names)
 * [/api/v1/label/.../values](https://prometheus.io/docs/prometheus/latest/querying/api/#querying-label-values)
+* [/api/v1/status/tsdb](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats)
 
 These handlers can be queried from Prometheus-compatible clients such as Grafana or curl.
 
@@ -926,6 +927,12 @@ The most interesting metrics are:
   If the gaps are related to irregular intervals between samples, then try adjusting `-search.minStalenessInterval` command-line flag
   to value close to the maximum interval between samples.
 
+* Metrics and labels leading to high cardinality or high churn rate can be determined on the `/api/v1/status/tsdb` page.
+  See [these docs](https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats) for details.
+  VictoriaMetrics accepts optional `date=YYYY-MM-DD` and `topN=42` query args on this page. By default `date` equals the current date,
+  while `topN` equals 10.
+
+
 ### Backfilling
 
 VictoriaMetrics accepts historical data in arbitrary order of time via [any supported ingestion method](#how-to-import-time-series-data).
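A note on the `date` arg documented above: TSDBStatusHandler (added earlier in this patch) parses `YYYY-MM-DD` and reduces it to days since the Unix epoch before calling the storage layer, which keys its per-date index by that day number. A minimal standalone sketch of the same conversion; the chosen date and program form are illustrative only:

package main

import (
	"fmt"
	"time"
)

const secsPerDay = 3600 * 24

func main() {
	// Mirrors TSDBStatusHandler: parse the optional `date` query arg
	// and convert it to a day index since the Unix epoch.
	t, err := time.Parse("2006-01-02", "2020-04-22")
	if err != nil {
		panic(err) // the handler returns the error to the caller instead
	}
	date := uint64(t.Unix() / secsPerDay)
	fmt.Println(date) // this day index is what GetTSDBStatusForDate receives
}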
diff --git a/lib/storage/index_db.go b/lib/storage/index_db.go
index bc1f9006c..36a0d3dd8 100644
--- a/lib/storage/index_db.go
+++ b/lib/storage/index_db.go
@@ -2,6 +2,7 @@ package storage
 
 import (
 	"bytes"
+	"container/heap"
 	"errors"
 	"fmt"
 	"io"
@@ -882,11 +883,232 @@ func (db *indexDB) GetSeriesCount() (uint64, error) {
 		extDB.putIndexSearch(is)
 	})
 	if ok && err != nil {
-		return 0, err
+		return 0, fmt.Errorf("error when searching in extDB: %s", err)
 	}
 	return n + nExt, nil
 }
 
+func (is *indexSearch) getSeriesCount() (uint64, error) {
+	ts := &is.ts
+	kb := &is.kb
+	mp := &is.mp
+	var metricIDsLen uint64
+	// Extract the number of series from ((__name__=value): metricIDs) rows
+	kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs)
+	kb.B = marshalTagValue(kb.B, nil)
+	ts.Seek(kb.B)
+	for ts.NextItem() {
+		item := ts.Item
+		if !bytes.HasPrefix(item, kb.B) {
+			break
+		}
+		tail := item[len(kb.B):]
+		n := bytes.IndexByte(tail, tagSeparatorChar)
+		if n < 0 {
+			return 0, fmt.Errorf("invalid tag->metricIDs line %q: cannot find tagSeparatorChar %d", item, tagSeparatorChar)
+		}
+		tail = tail[n+1:]
+		if err := mp.InitOnlyTail(item, tail); err != nil {
+			return 0, err
+		}
+		// Take into account deleted timeseries too.
+		// It is OK if series can be counted multiple times in rare cases -
+		// the returned number is an estimation.
+		metricIDsLen += uint64(mp.MetricIDsLen())
+	}
+	if err := ts.Error(); err != nil {
+		return 0, fmt.Errorf("error when counting unique timeseries: %s", err)
+	}
+	return metricIDsLen, nil
+}
+
+// GetTSDBStatusForDate returns topN TSDB status entries for the given date.
+func (db *indexDB) GetTSDBStatusForDate(date uint64, topN int) (*TSDBStatus, error) {
+	is := db.getIndexSearch()
+	status, err := is.getTSDBStatusForDate(date, topN)
+	db.putIndexSearch(is)
+	if err != nil {
+		return nil, err
+	}
+	if status.hasEntries() {
+		// The entries were found in the db. There is no need to search for them in extDB.
+		return status, nil
+	}
+
+	// The entries weren't found in the db. Try searching for them in extDB.
+	ok := db.doExtDB(func(extDB *indexDB) {
+		is := extDB.getIndexSearch()
+		status, err = is.getTSDBStatusForDate(date, topN)
+		extDB.putIndexSearch(is)
+	})
+	if ok && err != nil {
+		return nil, fmt.Errorf("error when obtaining TSDB status from extDB: %s", err)
+	}
+	return status, nil
+}
+
+func (is *indexSearch) getTSDBStatusForDate(date uint64, topN int) (*TSDBStatus, error) {
+	ts := &is.ts
+	kb := &is.kb
+	mp := &is.mp
+	thLabelValueCountByLabelName := newTopHeap(topN)
+	thSeriesCountByLabelValuePair := newTopHeap(topN)
+	thSeriesCountByMetricName := newTopHeap(topN)
+	var tmp, labelName, labelNameValue []byte
+	var labelValueCountByLabelName, seriesCountByLabelValuePair uint64
+	nameEqualBytes := []byte("__name__=")
+
+	kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixDateTagToMetricIDs)
+	kb.B = encoding.MarshalUint64(kb.B, date)
+	prefix := kb.B
+	ts.Seek(prefix)
+	for ts.NextItem() {
+		item := ts.Item
+		if !bytes.HasPrefix(item, prefix) {
+			break
+		}
+		tail := item[len(prefix):]
+		var err error
+		tail, tmp, err = unmarshalTagValue(tmp[:0], tail)
+		if err != nil {
+			return nil, fmt.Errorf("cannot unmarshal tag key from line %q: %s", item, err)
+		}
+		if len(tmp) == 0 {
+			tmp = append(tmp, "__name__"...)
+		}
+		if !bytes.Equal(tmp, labelName) {
+			thLabelValueCountByLabelName.pushIfNonEmpty(labelName, labelValueCountByLabelName)
+			labelValueCountByLabelName = 0
+			labelName = append(labelName[:0], tmp...)
+		}
+		tmp = append(tmp, '=')
+		tail, tmp, err = unmarshalTagValue(tmp, tail)
+		if err != nil {
+			return nil, fmt.Errorf("cannot unmarshal tag value from line %q: %s", item, err)
+		}
+		if !bytes.Equal(tmp, labelNameValue) {
+			thSeriesCountByLabelValuePair.pushIfNonEmpty(labelNameValue, seriesCountByLabelValuePair)
+			if bytes.HasPrefix(labelNameValue, nameEqualBytes) {
+				thSeriesCountByMetricName.pushIfNonEmpty(labelNameValue[len(nameEqualBytes):], seriesCountByLabelValuePair)
+			}
+			seriesCountByLabelValuePair = 0
+			labelValueCountByLabelName++
+			labelNameValue = append(labelNameValue[:0], tmp...)
+		}
+		if err := mp.InitOnlyTail(item, tail); err != nil {
+			return nil, err
+		}
+		// Take into account deleted timeseries too.
+		// It is OK if series can be counted multiple times in rare cases -
+		// the returned number is an estimation.
+		seriesCountByLabelValuePair += uint64(mp.MetricIDsLen())
+	}
+	if err := ts.Error(); err != nil {
+		return nil, fmt.Errorf("error when counting time series by metric names: %s", err)
+	}
+	thLabelValueCountByLabelName.pushIfNonEmpty(labelName, labelValueCountByLabelName)
+	thSeriesCountByLabelValuePair.pushIfNonEmpty(labelNameValue, seriesCountByLabelValuePair)
+	if bytes.HasPrefix(labelNameValue, nameEqualBytes) {
+		thSeriesCountByMetricName.pushIfNonEmpty(labelNameValue[len(nameEqualBytes):], seriesCountByLabelValuePair)
+	}
+	status := &TSDBStatus{
+		SeriesCountByMetricName:     thSeriesCountByMetricName.getSortedResult(),
+		LabelValueCountByLabelName:  thLabelValueCountByLabelName.getSortedResult(),
+		SeriesCountByLabelValuePair: thSeriesCountByLabelValuePair.getSortedResult(),
+	}
+	return status, nil
+}
+
+// TSDBStatus contains TSDB status data for /api/v1/status/tsdb.
+//
+// See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats
+type TSDBStatus struct {
+	SeriesCountByMetricName     []TopHeapEntry
+	LabelValueCountByLabelName  []TopHeapEntry
+	SeriesCountByLabelValuePair []TopHeapEntry
+}
+
+func (status *TSDBStatus) hasEntries() bool {
+	return len(status.SeriesCountByLabelValuePair) > 0
+}
+
+// topHeap maintains a heap of TopHeapEntry items with the maximum TopHeapEntry.Count values.
+type topHeap struct {
+	topN int
+	a    []TopHeapEntry
+}
+
+// newTopHeap returns topHeap for topN items.
+func newTopHeap(topN int) *topHeap {
+	return &topHeap{
+		topN: topN,
+	}
+}
+
+// TopHeapEntry represents an entry from `top heap` used in stats.
+type TopHeapEntry struct {
+	Name  string
+	Count uint64
+}
+
+func (th *topHeap) pushIfNonEmpty(name []byte, count uint64) {
+	if count == 0 {
+		return
+	}
+	if len(th.a) < th.topN {
+		th.a = append(th.a, TopHeapEntry{
+			Name:  string(name),
+			Count: count,
+		})
+		heap.Fix(th, len(th.a)-1)
+		return
+	}
+	if count <= th.a[0].Count {
+		return
+	}
+	th.a[0] = TopHeapEntry{
+		Name:  string(name),
+		Count: count,
+	}
+	heap.Fix(th, 0)
+}
+
+func (th *topHeap) getSortedResult() []TopHeapEntry {
+	result := append([]TopHeapEntry{}, th.a...)
+	sort.Slice(result, func(i, j int) bool {
+		a, b := result[i], result[j]
+		if a.Count != b.Count {
+			return a.Count > b.Count
+		}
+		return a.Name < b.Name
+	})
+	return result
+}
+
+// heap.Interface implementation for topHeap.
+ +func (th *topHeap) Len() int { + return len(th.a) +} + +func (th *topHeap) Less(i, j int) bool { + a := th.a + return a[i].Count < a[j].Count +} + +func (th *topHeap) Swap(i, j int) { + a := th.a + a[j], a[i] = a[i], a[j] +} + +func (th *topHeap) Push(x interface{}) { + panic(fmt.Errorf("BUG: Push shouldn't be called")) +} + +func (th *topHeap) Pop() interface{} { + panic(fmt.Errorf("BUG: Pop shouldn't be called")) +} + // searchMetricName appends metric name for the given metricID to dst // and returns the result. func (db *indexDB) searchMetricName(dst []byte, metricID uint64) ([]byte, error) { @@ -1314,40 +1536,6 @@ func (is *indexSearch) getTSIDByMetricID(dst *TSID, metricID uint64) error { return nil } -func (is *indexSearch) getSeriesCount() (uint64, error) { - ts := &is.ts - kb := &is.kb - mp := &is.mp - var metricIDsLen uint64 - // Extract the number of series from ((__name__=value): metricIDs) rows - kb.B = marshalCommonPrefix(kb.B[:0], nsPrefixTagToMetricIDs) - kb.B = marshalTagValue(kb.B, nil) - ts.Seek(kb.B) - for ts.NextItem() { - item := ts.Item - if !bytes.HasPrefix(item, kb.B) { - break - } - tail := item[len(kb.B):] - n := bytes.IndexByte(tail, tagSeparatorChar) - if n < 0 { - return 0, fmt.Errorf("invalid tag->metricIDs line %q: cannot find tagSeparatorChar %d", item, tagSeparatorChar) - } - tail = tail[n+1:] - if err := mp.InitOnlyTail(item, tail); err != nil { - return 0, err - } - // Take into account deleted timeseries too. - // It is OK if series can be counted multiple times in rare cases - - // the returned number is an estimation. - metricIDsLen += uint64(mp.MetricIDsLen()) - } - if err := ts.Error(); err != nil { - return 0, fmt.Errorf("error when counting unique timeseries: %s", err) - } - return metricIDsLen, nil -} - // updateMetricIDsByMetricNameMatch matches metricName values for the given srcMetricIDs against tfs // and adds matching metrics to metricIDs. 
func (is *indexSearch) updateMetricIDsByMetricNameMatch(metricIDs, srcMetricIDs *uint64set.Set, tfs []*tagFilter) error { diff --git a/lib/storage/index_db_test.go b/lib/storage/index_db_test.go index e947a4dc5..171f8a71e 100644 --- a/lib/storage/index_db_test.go +++ b/lib/storage/index_db_test.go @@ -1540,6 +1540,70 @@ func TestSearchTSIDWithTimeRange(t *testing.T) { if len(matchedTSIDs) != metricsPerDay*days { t.Fatal("Expected time series for all days, got", len(matchedTSIDs)) } + + // Check GetTSDBStatusForDate + status, err := db.GetTSDBStatusForDate(baseDate, 5) + if err != nil { + t.Fatalf("error in GetTSDBStatusForDate: %s", err) + } + if !status.hasEntries() { + t.Fatalf("expecting non-empty TSDB status") + } + expectedSeriesCountByMetricName := []TopHeapEntry{ + { + Name: "testMetric", + Count: 1000, + }, + } + if !reflect.DeepEqual(status.SeriesCountByMetricName, expectedSeriesCountByMetricName) { + t.Fatalf("unexpected SeriesCountByMetricName;\ngot\n%v\nwant\n%v", status.SeriesCountByMetricName, expectedSeriesCountByMetricName) + } + expectedLabelValueCountByLabelName := []TopHeapEntry{ + { + Name: "uniqueid", + Count: 1000, + }, + { + Name: "__name__", + Count: 1, + }, + { + Name: "constant", + Count: 1, + }, + { + Name: "day", + Count: 1, + }, + } + if !reflect.DeepEqual(status.LabelValueCountByLabelName, expectedLabelValueCountByLabelName) { + t.Fatalf("unexpected LabelValueCountByLabelName;\ngot\n%v\nwant\n%v", status.LabelValueCountByLabelName, expectedLabelValueCountByLabelName) + } + expectedSeriesCountByLabelValuePair := []TopHeapEntry{ + { + Name: "__name__=testMetric", + Count: 1000, + }, + { + Name: "constant=const", + Count: 1000, + }, + { + Name: "day=0", + Count: 1000, + }, + { + Name: "uniqueid=0", + Count: 1, + }, + { + Name: "uniqueid=1", + Count: 1, + }, + } + if !reflect.DeepEqual(status.SeriesCountByLabelValuePair, expectedSeriesCountByLabelValuePair) { + t.Fatalf("unexpected SeriesCountByLabelValuePair;\ngot\n%v\nwant\n%v", status.SeriesCountByLabelValuePair, expectedSeriesCountByLabelValuePair) + } } func toTFPointers(tfs []tagFilter) []*tagFilter { diff --git a/lib/storage/storage.go b/lib/storage/storage.go index 98296128d..3f6bb5cdf 100644 --- a/lib/storage/storage.go +++ b/lib/storage/storage.go @@ -780,6 +780,13 @@ func (s *Storage) GetSeriesCount() (uint64, error) { return s.idb().GetSeriesCount() } +// GetTSDBStatusForDate returns TSDB status data for /api/v1/status/tsdb. +// +// See https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-stats +func (s *Storage) GetTSDBStatusForDate(date uint64, topN int) (*TSDBStatus, error) { + return s.idb().GetTSDBStatusForDate(date, topN) +} + // MetricRow is a metric to insert into storage. type MetricRow struct { // MetricNameRaw contains raw metric name, which must be decoded
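To round out the patch, a closer look at the idea behind topHeap in lib/storage/index_db.go: the index scan streams per-label-value series counts, and each heap keeps only the topN largest, so memory stays bounded regardless of label cardinality. The smallest kept count sits at the heap root, where it can be compared and evicted cheaply. The program below is an editorial, self-contained rendition of the same bounded top-N pattern, not code from the patch; as in the patch, Push and Pop panic because heap.Fix never invokes them:

package main

import (
	"container/heap"
	"fmt"
	"sort"
)

// entry mirrors storage.TopHeapEntry from the patch.
type entry struct {
	name  string
	count uint64
}

// topN keeps the N largest counts seen so far. The minimum kept count sits
// at the heap root so it can be compared against and evicted in O(log N).
type topN struct {
	limit int
	a     []entry
}

func (t *topN) Len() int           { return len(t.a) }
func (t *topN) Less(i, j int) bool { return t.a[i].count < t.a[j].count }
func (t *topN) Swap(i, j int)      { t.a[i], t.a[j] = t.a[j], t.a[i] }
func (t *topN) Push(x interface{}) { panic("unused: entries are added via add") }
func (t *topN) Pop() interface{}   { panic("unused: entries are replaced in place") }

// add mirrors topHeap.pushIfNonEmpty: grow until the limit, then replace
// the root only when the new count beats the current minimum.
func (t *topN) add(name string, count uint64) {
	if count == 0 {
		return
	}
	if len(t.a) < t.limit {
		t.a = append(t.a, entry{name, count})
		heap.Fix(t, len(t.a)-1)
		return
	}
	if count <= t.a[0].count {
		return
	}
	t.a[0] = entry{name, count}
	heap.Fix(t, 0)
}

// sorted mirrors topHeap.getSortedResult: descending by count.
func (t *topN) sorted() []entry {
	res := append([]entry{}, t.a...)
	sort.Slice(res, func(i, j int) bool { return res[i].count > res[j].count })
	return res
}

func main() {
	th := &topN{limit: 2}
	th.add("http_requests_total", 1000)
	th.add("node_cpu_seconds_total", 64)
	th.add("up", 3) // dropped: smaller than the current heap minimum
	fmt.Println(th.sorted()) // [{http_requests_total 1000} {node_cpu_seconds_total 64}]
}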