From b5b3c585b3761ee8a3e97111299ecf04680c61a4 Mon Sep 17 00:00:00 2001 From: Aliaksandr Valialkin Date: Thu, 3 Feb 2022 20:22:35 +0200 Subject: [PATCH] lib/promscrape: show the total number of scrapes and the total number of scrape errors per target at /targets page This information may be useful when debugging unreliable scrape targets --- docs/CHANGELOG.md | 3 +- lib/promscrape/targets_response.qtpl | 74 +-- lib/promscrape/targets_response.qtpl.go | 575 +++++++++++++----------- lib/promscrape/targetstatus.go | 40 +- 4 files changed, 359 insertions(+), 333 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 61d16bcdf..78c98a4ce 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -16,7 +16,8 @@ sort: 15 * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add ability to configure notifiers (e.g. alertmanager) via a file in the way similar to Prometheus. See [these docs](https://docs.victoriametrics.com/vmalert.html#notifier-configuration-file), [this pull request](https://github.com/VictoriaMetrics/VictoriaMetrics/pull/2127). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add support for Consul service discovery for notifiers. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1947). * FEATURE: [vmalert](https://docs.victoriametrics.com/vmalert.html): add support for specifying Basic Auth password for notifiers via a file. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/1567). -* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): provide the ability to fetch target responses on behalf of `vmagent`. Click `fetch response` link for the needed target at `/targets` page. This feature may be useful for debugging responses from targets located in isolated environments. 
+* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): provide the ability to fetch target responses on behalf of `vmagent` by clicking the `response` link for the needed target on the `/targets` page. This feature may be useful for debugging responses from targets located in isolated environments. +* FEATURE: [vmagent](https://docs.victoriametrics.com/vmagent.html): show the total number of scrapes and the total number of scrape errors per target on the `/targets` page. This information may be useful when debugging unreliable scrape targets. * BUGFIX: return proper results from `highestMax()` function at [Graphite render API](https://docs.victoriametrics.com/#graphite-render-api-usage). Previously it was incorrectly returning timeseries with min peaks instead of max peaks. * BUGFIX: properly limit indexdb cache sizes. Previously they could exceed values set via `-memory.allowedPercent` and/or `-memory.allowedBytes` when `indexdb` contained many data parts. See [this issue](https://github.com/VictoriaMetrics/VictoriaMetrics/issues/2007). 
diff --git a/lib/promscrape/targets_response.qtpl b/lib/promscrape/targets_response.qtpl index 9e50fbff7..1d50802ee 100644 --- a/lib/promscrape/targets_response.qtpl +++ b/lib/promscrape/targets_response.qtpl @@ -1,5 +1,8 @@ -{% import "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" -%} +{% import ( + "time" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/prompbmarshal" + "github.com/VictoriaMetrics/VictoriaMetrics/lib/promrelabel" +) %} {% stripspace %} @@ -9,18 +12,17 @@ job={%q= js.job %} ({%d js.upCount %}/{%d js.targetsTotal %}{% space %}up) {% newline %} {% for _, ts := range js.targetsStatus %} - {% code - labels := promLabelsString(ts.labels) - ol := promLabelsString(ts.originalLabels) - %} -{%s= "\t" %}state={% if ts.up %}up{% else %}down{% endif %},{% space %} - endpoint={%s= ts.endpoint %},{% space %} - labels={%s= labels %} - {% if showOriginLabels %}, originalLabels={%s= ol %}{% endif %},{% space %} - last_scrape={%f.3 ts.lastScrapeTime.Seconds() %}s ago,{% space %} - scrape_duration={%f.3 ts.scrapeDuration.Seconds() %}s,{% space %} +{%s= "\t" %} + state={% if ts.up %}up{% else %}down{% endif %},{% space %} + endpoint={%s= ts.sw.Config.ScrapeURL %},{% space %} + labels={%s= promLabelsString(promrelabel.FinalizeLabels(nil, ts.sw.Config.Labels)) %},{% space %} + {% if showOriginLabels %}originalLabels={%s= promLabelsString(ts.sw.Config.OriginalLabels) %},{% space %}{% endif %} + scrapes_total={%d ts.scrapesTotal %},{% space %} + scrapes_failed={%d ts.scrapesFailed %},{% space %} + last_scrape={%f.3 ts.getDurationFromLastScrape().Seconds() %}s ago,{% space %} + scrape_duration={%d int(ts.scrapeDuration) %}ms,{% space %} samples_scraped={%d ts.samplesScraped %},{% space %} - error={%q= ts.errMsg %} + error={% if ts.err != nil %}{%s= ts.err.Error() %}{% endif %} {% newline %} {% endfor %} {% endfor %} @@ -65,33 +67,47 @@ job={%q= jobName %} (0/0 up) Endpoint State - Labels - Last Scrape - Scrape Duration - Samples Scraped - Error + Labels + 
Scrapes + Errors + Last Scrape + Duration + Samples + Last error - {% for j, ts := range js.targetsStatus %} + {% for _, ts := range js.targetsStatus %} + {% code + endpoint := ts.sw.Config.ScrapeURL + targetID := getTargetID(ts.sw) + lastScrapeTime := ts.getDurationFromLastScrape() + %} {% if onlyUnhealthy && ts.up %}{% continue %}{% endif %} - {%s ts.endpoint %} ( - fetch response + {%s endpoint %} ( + response ) {% if ts.up %}UP{% else %}DOWN{% endif %} - {% space %} - {%= formatLabel(ts.labels) %} - `) +//line lib/promscrape/targets_response.qtpl:117 } -//line lib/promscrape/targets_response.qtpl:132 - qw422016.N().S(`}`) -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:119 + for _, jobName := range emptyJobs { +//line lib/promscrape/targets_response.qtpl:119 + qw422016.N().S(`

`) +//line lib/promscrape/targets_response.qtpl:122 + qw422016.E().S(jobName) +//line lib/promscrape/targets_response.qtpl:122 + qw422016.N().S(`(0/0 up)

EndpointStateLabelsLast ScrapeScrape DurationSamples ScrapedError
`) +//line lib/promscrape/targets_response.qtpl:138 + } +//line lib/promscrape/targets_response.qtpl:138 + qw422016.N().S(``) +//line lib/promscrape/targets_response.qtpl:141 } -//line lib/promscrape/targets_response.qtpl:134 -func writeformatLabel(qq422016 qtio422016.Writer, labels []prompbmarshal.Label) { -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 +func WriteTargetsResponseHTML(qq422016 qtio422016.Writer, jts []jobTargetsStatuses, emptyJobs []string, onlyUnhealthy bool) { +//line lib/promscrape/targets_response.qtpl:141 qw422016 := qt422016.AcquireWriter(qq422016) -//line lib/promscrape/targets_response.qtpl:134 - streamformatLabel(qw422016, labels) -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 + StreamTargetsResponseHTML(qw422016, jts, emptyJobs, onlyUnhealthy) +//line lib/promscrape/targets_response.qtpl:141 qt422016.ReleaseWriter(qw422016) -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 } -//line lib/promscrape/targets_response.qtpl:134 -func formatLabel(labels []prompbmarshal.Label) string { -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 +func TargetsResponseHTML(jts []jobTargetsStatuses, emptyJobs []string, onlyUnhealthy bool) string { +//line lib/promscrape/targets_response.qtpl:141 qb422016 := qt422016.AcquireByteBuffer() -//line lib/promscrape/targets_response.qtpl:134 - writeformatLabel(qb422016, labels) -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 + WriteTargetsResponseHTML(qb422016, jts, emptyJobs, onlyUnhealthy) +//line lib/promscrape/targets_response.qtpl:141 qs422016 := string(qb422016.B) -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 qt422016.ReleaseByteBuffer(qb422016) -//line lib/promscrape/targets_response.qtpl:134 +//line 
lib/promscrape/targets_response.qtpl:141 return qs422016 -//line lib/promscrape/targets_response.qtpl:134 +//line lib/promscrape/targets_response.qtpl:141 +} + +//line lib/promscrape/targets_response.qtpl:143 +func streamformatLabel(qw422016 *qt422016.Writer, labels []prompbmarshal.Label) { +//line lib/promscrape/targets_response.qtpl:143 + qw422016.N().S(`{`) +//line lib/promscrape/targets_response.qtpl:145 + for i, label := range labels { +//line lib/promscrape/targets_response.qtpl:146 + qw422016.E().S(label.Name) +//line lib/promscrape/targets_response.qtpl:146 + qw422016.N().S(`=`) +//line lib/promscrape/targets_response.qtpl:146 + qw422016.E().Q(label.Value) +//line lib/promscrape/targets_response.qtpl:147 + if i+1 < len(labels) { +//line lib/promscrape/targets_response.qtpl:147 + qw422016.N().S(`,`) +//line lib/promscrape/targets_response.qtpl:147 + qw422016.N().S(` `) +//line lib/promscrape/targets_response.qtpl:147 + } +//line lib/promscrape/targets_response.qtpl:148 + } +//line lib/promscrape/targets_response.qtpl:148 + qw422016.N().S(`}`) +//line lib/promscrape/targets_response.qtpl:150 +} + +//line lib/promscrape/targets_response.qtpl:150 +func writeformatLabel(qq422016 qtio422016.Writer, labels []prompbmarshal.Label) { +//line lib/promscrape/targets_response.qtpl:150 + qw422016 := qt422016.AcquireWriter(qq422016) +//line lib/promscrape/targets_response.qtpl:150 + streamformatLabel(qw422016, labels) +//line lib/promscrape/targets_response.qtpl:150 + qt422016.ReleaseWriter(qw422016) +//line lib/promscrape/targets_response.qtpl:150 +} + +//line lib/promscrape/targets_response.qtpl:150 +func formatLabel(labels []prompbmarshal.Label) string { +//line lib/promscrape/targets_response.qtpl:150 + qb422016 := qt422016.AcquireByteBuffer() +//line lib/promscrape/targets_response.qtpl:150 + writeformatLabel(qb422016, labels) +//line lib/promscrape/targets_response.qtpl:150 + qs422016 := string(qb422016.B) +//line lib/promscrape/targets_response.qtpl:150 + 
qt422016.ReleaseByteBuffer(qb422016) +//line lib/promscrape/targets_response.qtpl:150 + return qs422016 +//line lib/promscrape/targets_response.qtpl:150 } diff --git a/lib/promscrape/targetstatus.go b/lib/promscrape/targetstatus.go index 4d5c98677..ee804a326 100644 --- a/lib/promscrape/targetstatus.go +++ b/lib/promscrape/targetstatus.go @@ -126,6 +126,10 @@ func (tsm *targetStatusMap) Update(sw *scrapeWork, group string, up bool, scrape ts.scrapeTime = scrapeTime ts.scrapeDuration = scrapeDuration ts.samplesScraped = samplesScraped + ts.scrapesTotal++ + if !up { + ts.scrapesFailed++ + } ts.err = err tsm.mu.Unlock() } @@ -227,6 +231,8 @@ type targetStatus struct { scrapeTime int64 scrapeDuration int64 samplesScraped int + scrapesTotal int + scrapesFailed int err error } @@ -305,23 +311,11 @@ var droppedTargetsMap = &droppedTargets{ m: make(map[string]droppedTarget), } -type jobTargetStatus struct { - up bool - endpoint string - targetID string - labels []prompbmarshal.Label - originalLabels []prompbmarshal.Label - lastScrapeTime time.Duration - scrapeDuration time.Duration - samplesScraped int - errMsg string -} - type jobTargetsStatuses struct { job string upCount int targetsTotal int - targetsStatus []jobTargetStatus + targetsStatus []targetStatus } func (tsm *targetStatusMap) getTargetsStatusByJob() ([]jobTargetsStatuses, []string) { @@ -340,28 +334,12 @@ func (tsm *targetStatusMap) getTargetsStatusByJob() ([]jobTargetsStatuses, []str return statuses[i].sw.Config.ScrapeURL < statuses[j].sw.Config.ScrapeURL }) ups := 0 - var targetsStatuses []jobTargetStatus + var targetsStatuses []targetStatus for _, ts := range statuses { if ts.up { ups++ } - } - for _, st := range statuses { - errMsg := "" - if st.err != nil { - errMsg = st.err.Error() - } - targetsStatuses = append(targetsStatuses, jobTargetStatus{ - up: st.up, - endpoint: st.sw.Config.ScrapeURL, - targetID: getTargetID(st.sw), - labels: promrelabel.FinalizeLabels(nil, st.sw.Config.Labels), - originalLabels: 
st.sw.Config.OriginalLabels, - lastScrapeTime: st.getDurationFromLastScrape(), - scrapeDuration: time.Duration(st.scrapeDuration) * time.Millisecond, - samplesScraped: st.samplesScraped, - errMsg: errMsg, - }) + targetsStatuses = append(targetsStatuses, ts) } jts = append(jts, jobTargetsStatuses{ job: job,