VictoriaMetrics/lib/promscrape/targetstatus.go
Aliaksandr Valialkin b6d6a3a530
lib/promscrape: show dropped targets because of sharding at /service-discovery page
Previously the /service-discovery page didn't show targets dropped because of sharding
( https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets ).

Also show the reason why each target was dropped on the /service-discovery page.
This should simplify debugging of why particular targets are dropped.

While at it, do not remove dropped targets from the list on the /service-discovery page
until the total number of dropped targets exceeds the limit passed to -promscrape.maxDroppedTargets.
Previously the list was cleaned up every 10 minutes by removing entries which hadn't been updated
during the last minute. This could complicate debugging of dropped targets.

Updates https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5389
2023-12-04 17:42:46 +02:00
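
For example, a setup which drops many targets during relabeling may raise the limit when starting vmagent
(an illustrative sketch; adjust the flag values to the actual setup):

  ./vmagent -promscrape.config=/path/to/scrape.yml -promscrape.maxDroppedTargets=10000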


package promscrape
import (
"flag"
"fmt"
"io"
"net/http"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"time"
"unsafe"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel"
"github.com/VictoriaMetrics/VictoriaMetrics/lib/promutils"
"github.com/cespare/xxhash/v2"
)
var maxDroppedTargets = flag.Int("promscrape.maxDroppedTargets", 1000, "The maximum number of droppedTargets to show at /api/v1/targets page. "+
"Increase this value if your setup drops more scrape targets during relabeling and you need investigating labels for all the dropped targets. "+
"Note that the increased number of tracked dropped targets may result in increased memory usage")
var tsmGlobal = newTargetStatusMap()
// WriteTargetResponse serves requests to /target_response?id=<id>
//
// It fetches the response for the given target id and returns it.
func WriteTargetResponse(w http.ResponseWriter, r *http.Request) error {
targetID := r.FormValue("id")
sw := tsmGlobal.getScrapeWorkByTargetID(targetID)
if sw == nil {
return fmt.Errorf("cannot find target for id=%s", targetID)
}
data, err := sw.getTargetResponse()
if err != nil {
return fmt.Errorf("cannot fetch response from id=%s: %w", targetID, err)
}
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
_, err = w.Write(data)
return err
}
// WriteHumanReadableTargetsStatus writes human-readable status for all the scrape targets to w according to r.
func WriteHumanReadableTargetsStatus(w http.ResponseWriter, r *http.Request) {
filter := getRequestFilter(r)
tsr := tsmGlobal.getTargetsStatusByJob(filter)
if accept := r.Header.Get("Accept"); strings.Contains(accept, "text/html") {
w.Header().Set("Content-Type", "text/html; charset=utf-8")
WriteTargetsResponseHTML(w, tsr, filter)
} else {
w.Header().Set("Content-Type", "text/plain; charset=utf-8")
WriteTargetsResponsePlain(w, tsr, filter)
}
}
// WriteServiceDiscovery writes /service-discovery response to w similar to http://demo.robustperception.io:9090/service-discovery
func WriteServiceDiscovery(w http.ResponseWriter, r *http.Request) {
filter := getRequestFilter(r)
tsr := tsmGlobal.getTargetsStatusByJob(filter)
w.Header().Set("Content-Type", "text/html; charset=utf-8")
WriteServiceDiscoveryResponse(w, tsr, filter)
}
// WriteAPIV1Targets writes /api/v1/targets to w according to https://prometheus.io/docs/prometheus/latest/querying/api/#targets
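//
// An illustrative request for dropped targets only (host, port and the mapping of the `state` query arg depend on the actual handler wiring):
//
//	curl 'http://localhost:8429/api/v1/targets?state=dropped'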
func WriteAPIV1Targets(w io.Writer, state string) {
if state == "" {
state = "any"
}
fmt.Fprintf(w, `{"status":"success","data":{"activeTargets":`)
if state == "active" || state == "any" {
tsmGlobal.WriteActiveTargetsJSON(w)
} else {
fmt.Fprintf(w, `[]`)
}
fmt.Fprintf(w, `,"droppedTargets":`)
if state == "dropped" || state == "any" {
droppedTargetsMap.WriteDroppedTargetsJSON(w)
} else {
fmt.Fprintf(w, `[]`)
}
fmt.Fprintf(w, `}}`)
}
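// targetStatusMap tracks the current status of all the registered scrape targets, keyed by their scrapeWork.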
type targetStatusMap struct {
mu sync.Mutex
m map[*scrapeWork]*targetStatus
jobNames []string
}
func newTargetStatusMap() *targetStatusMap {
return &targetStatusMap{
m: make(map[*scrapeWork]*targetStatus),
}
}
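// Reset removes all the registered targets from tsm.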
func (tsm *targetStatusMap) Reset() {
tsm.mu.Lock()
tsm.m = make(map[*scrapeWork]*targetStatus)
tsm.mu.Unlock()
}
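// registerJobNames stores the given jobNames, so jobs without discovered targets can still be reported as empty.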
func (tsm *targetStatusMap) registerJobNames(jobNames []string) {
tsm.mu.Lock()
tsm.jobNames = append(tsm.jobNames[:0], jobNames...)
tsm.mu.Unlock()
}
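// Register registers the given sw as an active scrape target.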
func (tsm *targetStatusMap) Register(sw *scrapeWork) {
tsm.mu.Lock()
tsm.m[sw] = &targetStatus{
sw: sw,
}
tsm.mu.Unlock()
}
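// Unregister removes the given sw from the set of active scrape targets.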
func (tsm *targetStatusMap) Unregister(sw *scrapeWork) {
tsm.mu.Lock()
delete(tsm.m, sw)
tsm.mu.Unlock()
}
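// Update updates the status for the given sw with the results of the latest scrape attempt.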
func (tsm *targetStatusMap) Update(sw *scrapeWork, up bool, scrapeTime, scrapeDuration int64, samplesScraped int, err error) {
tsm.mu.Lock()
ts := tsm.m[sw]
if ts == nil {
ts = &targetStatus{
sw: sw,
}
tsm.m[sw] = ts
}
ts.up = up
ts.scrapeTime = scrapeTime
ts.scrapeDuration = scrapeDuration
ts.samplesScraped = samplesScraped
ts.scrapesTotal++
if !up {
ts.scrapesFailed++
}
ts.err = err
tsm.mu.Unlock()
}
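// getScrapeWorkByTargetID returns the scrapeWork for the given targetID or nil if no such target is registered.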
func (tsm *targetStatusMap) getScrapeWorkByTargetID(targetID string) *scrapeWork {
tsm.mu.Lock()
defer tsm.mu.Unlock()
for sw := range tsm.m {
// The target is uniquely identified by a pointer to its original labels.
if getLabelsID(sw.Config.OriginalLabels) == targetID {
return sw
}
}
return nil
}
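// getLabelsID returns an id for the given labels, which is derived from the labels pointer value.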
func getLabelsID(labels *promutils.Labels) string {
return fmt.Sprintf("%016x", uintptr(unsafe.Pointer(labels)))
}
// StatusByGroup returns the number of targets in the given group
// which have the given up status
func (tsm *targetStatusMap) StatusByGroup(group string, up bool) int {
var count int
tsm.mu.Lock()
for _, ts := range tsm.m {
if ts.sw.ScrapeGroup == group && ts.up == up {
count++
}
}
tsm.mu.Unlock()
return count
}
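// getActiveTargetStatuses returns a snapshot of statuses for all the active targets, sorted by the __address__ label.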
func (tsm *targetStatusMap) getActiveTargetStatuses() []targetStatus {
tsm.mu.Lock()
tss := make([]targetStatus, 0, len(tsm.m))
for _, ts := range tsm.m {
tss = append(tss, *ts)
}
tsm.mu.Unlock()
// Sort discovered targets by __address__ label, so they stay in consistent order across calls
sort.Slice(tss, func(i, j int) bool {
addr1 := tss[i].sw.Config.OriginalLabels.Get("__address__")
addr2 := tss[j].sw.Config.OriginalLabels.Get("__address__")
return addr1 < addr2
})
return tss
}
// WriteActiveTargetsJSON writes `activeTargets` contents to w according to https://prometheus.io/docs/prometheus/latest/querying/api/#targets
func (tsm *targetStatusMap) WriteActiveTargetsJSON(w io.Writer) {
tss := tsm.getActiveTargetStatuses()
fmt.Fprintf(w, `[`)
for i, ts := range tss {
fmt.Fprintf(w, `{"discoveredLabels":`)
writeLabelsJSON(w, ts.sw.Config.OriginalLabels)
fmt.Fprintf(w, `,"labels":`)
writeLabelsJSON(w, ts.sw.Config.Labels)
fmt.Fprintf(w, `,"scrapePool":%q`, ts.sw.Config.Job())
fmt.Fprintf(w, `,"scrapeUrl":%q`, ts.sw.Config.ScrapeURL)
errMsg := ""
if ts.err != nil {
errMsg = ts.err.Error()
}
fmt.Fprintf(w, `,"lastError":%q`, errMsg)
fmt.Fprintf(w, `,"lastScrape":%q`, time.Unix(ts.scrapeTime/1000, (ts.scrapeTime%1000)*1e6).Format(time.RFC3339Nano))
fmt.Fprintf(w, `,"lastScrapeDuration":%g`, (time.Millisecond * time.Duration(ts.scrapeDuration)).Seconds())
fmt.Fprintf(w, `,"lastSamplesScraped":%d`, ts.samplesScraped)
state := "up"
if !ts.up {
state = "down"
}
fmt.Fprintf(w, `,"health":%q}`, state)
if i+1 < len(tss) {
fmt.Fprintf(w, `,`)
}
}
fmt.Fprintf(w, `]`)
}
func writeLabelsJSON(w io.Writer, labels *promutils.Labels) {
fmt.Fprintf(w, `{`)
labelsList := labels.GetLabels()
for i, label := range labelsList {
fmt.Fprintf(w, "%q:%q", label.Name, label.Value)
if i+1 < len(labelsList) {
fmt.Fprintf(w, `,`)
}
}
fmt.Fprintf(w, `}`)
}
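// targetStatus contains the current status of a single active scrape target.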
type targetStatus struct {
sw *scrapeWork
up bool
scrapeTime int64
scrapeDuration int64
samplesScraped int
scrapesTotal int
scrapesFailed int
err error
}
func (ts *targetStatus) getDurationFromLastScrape() time.Duration {
return time.Since(time.Unix(ts.scrapeTime/1000, (ts.scrapeTime%1000)*1e6))
}
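// droppedTargets holds dropped targets together with the reason for dropping them; its size is limited by -promscrape.maxDroppedTargets.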
type droppedTargets struct {
mu sync.Mutex
m map[uint64]droppedTarget
}
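// droppedTarget contains information about a single dropped target.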
type droppedTarget struct {
originalLabels *promutils.Labels
relabelConfigs *promrelabel.ParsedConfigs
dropReason targetDropReason
}
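// targetDropReason describes the reason why the given target was dropped.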
type targetDropReason string
const (
targetDropReasonRelabeling = targetDropReason("relabeling") // target dropped because of relabeling
targetDropReasonMissingScrapeURL = targetDropReason("missing scrape URL") // target dropped because of missing scrape URL
targetDropReasonDuplicate = targetDropReason("duplicate") // target with the given set of labels already exists
targetDropReasonSharding = targetDropReason("sharding") // target is dropped because of sharding https://docs.victoriametrics.com/vmagent.html#scraping-big-number-of-targets
)
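// getTargetsList returns a snapshot of dropped targets, sorted by the __address__ label.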
func (dt *droppedTargets) getTargetsList() []droppedTarget {
dt.mu.Lock()
dts := make([]droppedTarget, 0, len(dt.m))
for _, v := range dt.m {
dts = append(dts, v)
}
dt.mu.Unlock()
// Sort discovered targets by __address__ label, so they stay in consistent order across calls
sort.Slice(dts, func(i, j int) bool {
addr1 := dts[i].originalLabels.Get("__address__")
addr2 := dts[j].originalLabels.Get("__address__")
return addr1 < addr2
})
return dts
}
// Register registers a dropped target with the given originalLabels.
//
// The relabelConfigs must contain the relabel configs which were applied to originalLabels.
// The reason must contain the reason why the target has been dropped.
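//
// An illustrative usage sketch (not taken from the original call sites):
//
//	droppedTargetsMap.Register(originalLabels, relabelConfigs, targetDropReasonSharding)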
func (dt *droppedTargets) Register(originalLabels *promutils.Labels, relabelConfigs *promrelabel.ParsedConfigs, reason targetDropReason) {
if originalLabels == nil {
// Do not register target without originalLabels. This is the case when *dropOriginalLabels is set to true.
return
}
// It is better to tolerate rare hash collisions than to spend additional CPU time on the originalLabels.String() call.
key := labelsHash(originalLabels)
dt.mu.Lock()
dt.m[key] = droppedTarget{
originalLabels: originalLabels,
relabelConfigs: relabelConfigs,
dropReason: reason,
}
if len(dt.m) >= *maxDroppedTargets {
for k := range dt.m {
delete(dt.m, k)
if len(dt.m) < *maxDroppedTargets {
break
}
}
}
dt.mu.Unlock()
}
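// labelsHash returns a hash for the given labels; rare collisions are acceptable for the dropped targets map.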
func labelsHash(labels *promutils.Labels) uint64 {
d := xxhashPool.Get().(*xxhash.Digest)
for _, label := range labels.GetLabels() {
_, _ = d.WriteString(label.Name)
_, _ = d.WriteString(label.Value)
}
h := d.Sum64()
d.Reset()
xxhashPool.Put(d)
return h
}
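// xxhashPool reduces allocations by reusing xxhash.Digest objects across labelsHash calls.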
var xxhashPool = &sync.Pool{
New: func() interface{} {
return xxhash.New()
},
}
// WriteDroppedTargetsJSON writes `droppedTargets` contents to w according to https://prometheus.io/docs/prometheus/latest/querying/api/#targets
func (dt *droppedTargets) WriteDroppedTargetsJSON(w io.Writer) {
dts := dt.getTargetsList()
fmt.Fprintf(w, `[`)
for i, dt := range dts {
fmt.Fprintf(w, `{"discoveredLabels":`)
writeLabelsJSON(w, dt.originalLabels)
fmt.Fprintf(w, `}`)
if i+1 < len(dts) {
fmt.Fprintf(w, `,`)
}
}
fmt.Fprintf(w, `]`)
}
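// droppedTargetsMap holds the most recently dropped targets (up to -promscrape.maxDroppedTargets entries).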
var droppedTargetsMap = &droppedTargets{
m: make(map[uint64]droppedTarget),
}
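// jobTargetsStatuses contains the statuses of all the targets for a single scrape job.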
type jobTargetsStatuses struct {
jobName string
upCount int
targetsTotal int
targetsStatus []targetStatus
}
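// getTargetsStatusByJob returns per-job statuses for active targets together with dropped targets, filtered according to the given filter.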
func (tsm *targetStatusMap) getTargetsStatusByJob(filter *requestFilter) *targetsStatusResult {
byJob := make(map[string][]targetStatus)
tsm.mu.Lock()
for _, ts := range tsm.m {
jobName := ts.sw.Config.jobNameOriginal
byJob[jobName] = append(byJob[jobName], *ts)
}
jobNames := append([]string{}, tsm.jobNames...)
tsm.mu.Unlock()
var jts []*jobTargetsStatuses
for jobName, statuses := range byJob {
sort.Slice(statuses, func(i, j int) bool {
return statuses[i].sw.Config.ScrapeURL < statuses[j].sw.Config.ScrapeURL
})
ups := 0
var targetsStatuses []targetStatus
for _, ts := range statuses {
if ts.up {
ups++
}
if filter.showOnlyUnhealthy && ts.up {
continue
}
targetsStatuses = append(targetsStatuses, ts)
}
jts = append(jts, &jobTargetsStatuses{
jobName: jobName,
upCount: ups,
targetsTotal: len(statuses),
targetsStatus: targetsStatuses,
})
}
sort.Slice(jts, func(i, j int) bool {
return jts[i].jobName < jts[j].jobName
})
emptyJobs := getEmptyJobs(jts, jobNames)
var err error
jts, err = filterTargets(jts, filter.endpointSearch, filter.labelSearch)
if len(filter.endpointSearch) > 0 || len(filter.labelSearch) > 0 {
// Do not show empty jobs if target filters are set.
emptyJobs = nil
}
dts := droppedTargetsMap.getTargetsList()
return &targetsStatusResult{
hasOriginalLabels: !*dropOriginalLabels,
jobTargetsStatuses: jts,
droppedTargets: dts,
emptyJobs: emptyJobs,
err: err,
}
}
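// filterTargetsByEndpoint leaves only the targets with scrape URLs matching the given searchQuery regexp.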
func filterTargetsByEndpoint(jts []*jobTargetsStatuses, searchQuery string) ([]*jobTargetsStatuses, error) {
if searchQuery == "" {
return jts, nil
}
finder, err := regexp.Compile(searchQuery)
if err != nil {
return nil, fmt.Errorf("cannot parse %s: %w", searchQuery, err)
}
var jtsFiltered []*jobTargetsStatuses
for _, job := range jts {
var tss []targetStatus
for _, ts := range job.targetsStatus {
if finder.MatchString(ts.sw.Config.ScrapeURL) {
tss = append(tss, ts)
}
}
if len(tss) == 0 {
// Skip jobs with zero targets after filtering, so users can see only the requested targets
continue
}
job.targetsStatus = tss
jtsFiltered = append(jtsFiltered, job)
}
return jtsFiltered, nil
}
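// filterTargetsByLabels leaves only the targets with labels matching the given searchQuery, which is parsed as a relabeling `if` expression.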
func filterTargetsByLabels(jts []*jobTargetsStatuses, searchQuery string) ([]*jobTargetsStatuses, error) {
if searchQuery == "" {
return jts, nil
}
var ie promrelabel.IfExpression
if err := ie.Parse(searchQuery); err != nil {
return nil, fmt.Errorf("cannot parse %s: %w", searchQuery, err)
}
var jtsFiltered []*jobTargetsStatuses
for _, job := range jts {
var tss []targetStatus
for _, ts := range job.targetsStatus {
labels := ts.sw.Config.Labels.GetLabels()
if ie.Match(labels) {
tss = append(tss, ts)
}
}
if len(tss) == 0 {
// Skip jobs with zero targets after filtering, so users can see only the requested targets
continue
}
job.targetsStatus = tss
jtsFiltered = append(jtsFiltered, job)
}
return jtsFiltered, nil
}
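// filterTargets applies both the endpoint filter and the label filter to jts.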
func filterTargets(jts []*jobTargetsStatuses, endpointQuery, labelQuery string) ([]*jobTargetsStatuses, error) {
var err error
jts, err = filterTargetsByEndpoint(jts, endpointQuery)
if err != nil {
return nil, err
}
jts, err = filterTargetsByLabels(jts, labelQuery)
if err != nil {
return nil, err
}
return jts, nil
}
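// getEmptyJobs returns a sorted list of job names without any discovered targets.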
func getEmptyJobs(jts []*jobTargetsStatuses, jobNames []string) []string {
jobNamesMap := make(map[string]struct{}, len(jobNames))
for _, jobName := range jobNames {
jobNamesMap[jobName] = struct{}{}
}
for i := range jts {
delete(jobNamesMap, jts[i].jobName)
}
emptyJobs := make([]string, 0, len(jobNamesMap))
for k := range jobNamesMap {
emptyJobs = append(emptyJobs, k)
}
sort.Strings(emptyJobs)
return emptyJobs
}
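// requestFilter contains the filters parsed from query args of /targets and /service-discovery requests.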
type requestFilter struct {
showOriginalLabels bool
showOnlyUnhealthy bool
endpointSearch string
labelSearch string
}
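// getRequestFilter extracts the requestFilter from the query args of r.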
func getRequestFilter(r *http.Request) *requestFilter {
showOriginalLabels, _ := strconv.ParseBool(r.FormValue("show_original_labels"))
showOnlyUnhealthy, _ := strconv.ParseBool(r.FormValue("show_only_unhealthy"))
endpointSearch := strings.TrimSpace(r.FormValue("endpoint_search"))
labelSearch := strings.TrimSpace(r.FormValue("label_search"))
return &requestFilter{
showOriginalLabels: showOriginalLabels,
showOnlyUnhealthy: showOnlyUnhealthy,
endpointSearch: endpointSearch,
labelSearch: labelSearch,
}
}
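// targetsStatusResult contains the data needed for rendering the /targets and /service-discovery pages.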
type targetsStatusResult struct {
hasOriginalLabels bool
jobTargetsStatuses []*jobTargetsStatuses
droppedTargets []droppedTarget
emptyJobs []string
err error
}
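// targetLabels contains labels for a single active or dropped target.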
type targetLabels struct {
up bool
originalLabels *promutils.Labels
labels *promutils.Labels
dropReason targetDropReason
}
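// targetLabelsByJob groups active and dropped target labels by scrape job.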
type targetLabelsByJob struct {
jobName string
targets []targetLabels
activeTargets int
droppedTargets int
}
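// getMetricRelabelContextByTargetID returns metric relabel configs and labels for the active target with the given targetID.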
func getMetricRelabelContextByTargetID(targetID string) (*promrelabel.ParsedConfigs, *promutils.Labels, bool) {
tsmGlobal.mu.Lock()
defer tsmGlobal.mu.Unlock()
for sw := range tsmGlobal.m {
// The target is uniquely identified by a pointer to its original labels.
if getLabelsID(sw.Config.OriginalLabels) == targetID {
return sw.Config.MetricRelabelConfigs, sw.Config.Labels, true
}
}
return nil, nil, false
}
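// getTargetRelabelContextByTargetID returns target relabel configs and original labels for the given targetID, looking at both active and dropped targets.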
func getTargetRelabelContextByTargetID(targetID string) (*promrelabel.ParsedConfigs, *promutils.Labels, bool) {
var relabelConfigs *promrelabel.ParsedConfigs
var labels *promutils.Labels
found := false
// Search for relabel context in tsmGlobal (aka active targets)
tsmGlobal.mu.Lock()
for sw := range tsmGlobal.m {
// The target is uniquely identified by a pointer to its original labels.
if getLabelsID(sw.Config.OriginalLabels) == targetID {
relabelConfigs = sw.Config.RelabelConfigs
labels = sw.Config.OriginalLabels
found = true
break
}
}
tsmGlobal.mu.Unlock()
if found {
return relabelConfigs, labels, true
}
// Search for relabel context in droppedTargetsMap (aka dropped targets)
droppedTargetsMap.mu.Lock()
for _, dt := range droppedTargetsMap.m {
if getLabelsID(dt.originalLabels) == targetID {
relabelConfigs = dt.relabelConfigs
labels = dt.originalLabels
found = true
break
}
}
droppedTargetsMap.mu.Unlock()
return relabelConfigs, labels, found
}
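// getTargetLabelsByJob groups active and dropped targets from tsr by scrape job name.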
func (tsr *targetsStatusResult) getTargetLabelsByJob() []*targetLabelsByJob {
byJob := make(map[string]*targetLabelsByJob)
for _, jts := range tsr.jobTargetsStatuses {
jobName := jts.jobName
for _, ts := range jts.targetsStatus {
m := byJob[jobName]
if m == nil {
m = &targetLabelsByJob{
jobName: jobName,
}
byJob[jobName] = m
}
m.activeTargets++
m.targets = append(m.targets, targetLabels{
up: ts.up,
originalLabels: ts.sw.Config.OriginalLabels,
labels: ts.sw.Config.Labels,
})
}
}
for _, dt := range tsr.droppedTargets {
jobName := dt.originalLabels.Get("job")
m := byJob[jobName]
if m == nil {
m = &targetLabelsByJob{
jobName: jobName,
}
byJob[jobName] = m
}
m.droppedTargets++
m.targets = append(m.targets, targetLabels{
originalLabels: dt.originalLabels,
dropReason: dt.dropReason,
})
}
a := make([]*targetLabelsByJob, 0, len(byJob))
for _, tls := range byJob {
a = append(a, tls)
}
sort.Slice(a, func(i, j int) bool {
return a[i].jobName < a[j].jobName
})
return a
}